Mercurial > hg > xemacs-beta
comparison lisp/w3/w3-parse.el @ 0:376386a54a3c r19-14
Import from CVS: tag r19-14
author | cvs |
---|---|
date | Mon, 13 Aug 2007 08:45:50 +0200 |
parents | |
children | ac2d302a0011 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:376386a54a3c |
---|---|
1 ;; Created by: Joe Wells, jbw@csb.bu.edu | |
2 ;; Created on: Sat Sep 30 17:25:40 1995 | |
3 ;; Filename: w3-parse.el | |
4 ;; Purpose: Parse HTML and/or SGML for Emacs W3 browser. | |
5 | |
6 ;; Copyright © 1995 Joseph Brian Wells | |
7 ;; Copyright © 1993, 1994, 1995 by William M. Perry (wmperry@spry.com) | |
8 ;; | |
9 ;; This program is free software; you can redistribute it and/or modify | |
10 ;; it under the terms of the GNU General Public License as published by | |
11 ;; the Free Software Foundation; either version 2 of the License, or | |
12 ;; (at your option) any later version. | |
13 ;; | |
14 ;; This program is distributed in the hope that it will be useful, | |
15 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of | |
16 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
17 ;; GNU General Public License for more details. | |
18 ;; | |
19 ;; You should have received a copy of the GNU General Public License | |
20 ;; along with this program; if not, write to the Free Software | |
21 ;; Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. | |
22 ;; | |
23 ;; On November 13, 1995, the license was available at | |
24 ;; <URL:ftp://prep.ai.mit.edu/pub/gnu/COPYING-2.0>. It may still be | |
25 ;; obtainable via that URL. | |
26 | |
27 | |
28 ;;; | |
29 ;;; Trying to make the best of an evil speed hack. | |
30 ;;; | |
31 | |
32 ;; Explanation: | |
33 | |
34 ;; Basically, this file provides one big function (w3-parse-buffer) and | |
35 ;; some data structures. However, to avoid code redundancy, I have broken | |
36 ;; out some common subexpressions of w3-parse-buffer into separate | |
37 ;; functions. I have declared these separate functions with "defsubst" so | |
38 ;; they will be inlined into w3-parse-buffer. Also, I have defined them | |
39 ;; within eval-when-compile forms, so no definitions will be emitted into | |
40 ;; the .elc file for these separate functions. (They will work normally | |
41 ;; when the uncompiled file is loaded.) | |
42 | |
43 ;; Each of these subfunctions use some scratch variables in a purely local | |
44 ;; fashion. In good software design, I would declare these variables as | |
45 ;; close to their use as possible with "let". However, "let"-binding | |
46 ;; variables is *SLOW* in Emacs Lisp, even when compiled. Since each of | |
47 ;; these functions is executed one or more time during each iteration of | |
48 ;; the main loop, I deemed this too expensive. So the main function does | |
49 ;; the "let"-binding of these variables. However, I still want to declare | |
50 ;; them close to their use, partially to keep the compiler from crying | |
51 ;; "Wolf!" when there is no danger (well, maybe a little danger :-), so I | |
52 ;; define some macros for this purpose. | |
53 | |
54 ;; Also, there are some variables which are updated throughout the file | |
55 ;; (remember this is really all one function). Some of the code which | |
56 ;; updates them is located inside the subfunctions. So that the compiler | |
57 ;; will not complain, these variables are defined with defvar. | |
58 | |
59 (require 'w3-vars) | |
60 | |
61 (eval-when-compile | |
62 (defconst w3-p-s-var-list nil | |
63 "A list of the scratch variables used by functions called by | |
64 w3-parse-buffer which it is w3-parse-buffer's responsibility to | |
65 \"let\"-bind.") | |
66 | |
67 ;; *** This is unused and does not belong right here anyway. | |
68 (defmacro w3-resolve-numeric-entity (ent) | |
69 "Return a string representing the numeric entity ENT (&#ENT;)" | |
70 (` (if (< (, ent) 256) | |
71 (char-to-string (, ent)) | |
72 (format "[Too large character: %s]" (, ent))))) | |
73 | |
74 (defmacro w3-p-s-var-def (var) | |
75 "Declare VAR as a scratch variable which w3-parse-buffer must | |
76 \"let\"-bind." | |
77 (` (eval-when-compile | |
78 (defvar (, var)) | |
79 (or (memq '(, var) w3-p-s-var-list) | |
80 (setq w3-p-s-var-list (cons '(, var) w3-p-s-var-list)))))) | |
81 | |
82 (defmacro w3-p-s-let-bindings (&rest body) | |
83 "\"let\"-bind all of the variables in w3-p-s-var-list in BODY." | |
84 (` (let (, w3-p-s-var-list) | |
85 (,@ body)))) | |
86 (put 'w3-p-s-let-bindings 'lisp-indent-function 0) | |
87 (put 'w3-p-s-let-bindings 'edebug-form-spec t) | |
88 | |
89 (defvar w3-p-d-current-element) | |
90 (put 'w3-p-d-current-element 'variable-documentation | |
91 "Information structure for the current open element.") | |
92 | |
93 (defvar w3-p-d-exceptions) | |
94 (put 'w3-p-d-exceptions 'variable-documentation | |
95 "Alist specifying elements (dis)allowed because of an (ex|in)clusion | |
96 exception of some containing element (not necessarily the immediately | |
97 containing element). Each item specifies a transition for an element | |
98 which overrides that specified by the current element's content model. | |
99 Each item is of the form (TAG ACTION *same ERRORP).") | |
100 | |
101 (defvar w3-p-d-in-parsed-marked-section) | |
102 (put 'w3-p-d-in-parsed-marked-section 'variable-documentation | |
103 "Are we in a parsed marked section so that we have to scan for \"]]>\"?") | |
104 | |
105 (defvar w3-p-d-non-markup-chars) | |
106 (put 'w3-p-d-non-markup-chars 'variable-documentation | |
107 "The characters that do not indicate the start of markup, in the format | |
108 for an argument to skip-chars-forward.") | |
109 | |
110 (defvar w3-p-d-null-end-tag-enabled) | |
111 (put 'w3-p-d-null-end-tag-enabled 'variable-documentation | |
112 "Is the null end tag (\"/\") enabled?") | |
113 | |
114 (defvar w3-p-d-open-element-stack) | |
115 (put 'w3-p-d-open-element-stack 'variable-documentation | |
116 "A stack of the currently open elements, with the innermost enclosing | |
117 element on top and the outermost on bottom.") | |
118 | |
119 (defvar w3-p-d-parse-tag-stream-tail-pointer) | |
120 (put 'w3-p-d-parse-tag-stream-tail-pointer 'variable-documentation | |
121 "Points to last cons cell in parse-tag stream. We add items to tail of | |
122 parse-tag-stream instead of head.") | |
123 | |
124 (defvar w3-p-d-shortrefs) | |
125 (put 'w3-p-d-shortrefs 'variable-documentation | |
126 "An alist of the magic entity reference strings in the current | |
127 between-tags region and their replacements. Each item is of the format | |
128 \(REGEXP . REPLACEMENT-STRING\). Although in SGML shortrefs normally name | |
129 entities whose value should be used as the replacement, we have | |
130 preexpanded the entities for speed. We have also regexp-quoted the | |
131 strings to be replaced, so they can be used with looking-at. This should | |
132 never be in an element's overrides field unless | |
133 w3-p-d-shortref-chars is also in the field.") | |
134 | |
135 (defvar w3-p-d-shortref-chars) | |
136 (put 'w3-p-d-shortref-chars 'variable-documentation | |
137 "A string of the characters which can start shortrefs in the current | |
138 between-tags region. This must be in a form which can be passed to | |
139 skip-chars-forward and must contain exactly the characters which start the | |
140 entries in w3-p-d-shortrefs. If this variable is mentioned in the | |
141 overrides field of an element, its handling is magical in that the | |
142 variable w3-p-d-non-markup-chars is saved to the element's undo-list and | |
143 updated at the same time. This should never be in an element's overrides | |
144 field unless w3-p-d-shortrefs is also in the field.") | |
145 | |
146 (defvar w3-p-d-tag-name) | |
147 (put 'w3-p-d-tag-name 'variable-documentation | |
148 "Name of tag we are looking at, as an Emacs Lisp symbol. | |
149 Only non-nil when we are looking at a tag.") | |
150 | |
151 (defvar w3-p-d-end-tag-p) | |
152 (put 'w3-p-d-end-tag-p 'variable-documentation | |
153 "Is the tag we are looking at an end tag? | |
154 Only non-nil when we are looking at a tag.") | |
155 | |
156 ) | |
157 | |
158 | |
159 ;;; | |
160 ;;; HTML syntax error messages. | |
161 ;;; | |
162 | |
163 (eval-when-compile | |
164 | |
165 (defvar w3-p-d-debug-url) | |
166 (put 'w3-p-d-debug-url 'variable-documentation | |
167 "Whether to print the URL being parsed before an error messages. | |
168 Only true for the first error message.") | |
169 | |
170 ;; The level parameter indicates whether the error is (1) very | |
171 ;; serious, must be displayed to all users, (2) invalid HTML, but the | |
172 ;; user should only be told if the user has indicated interest, or (3) | |
173 ;; valid HTML which is bad because it appears to rely on the way certain | |
174 ;; browsers will display it, which should only be displayed to the user | |
175 ;; if they have really asked for it. | |
176 | |
177 (defmacro w3-debug-html (&rest body) | |
178 "Emit a warning message. | |
179 These keywords may be used at the beginning of the arguments: | |
180 :mandatory-if sexp -- force printing if sexp evaluates non-nil. | |
181 :bad-style -- do not print unless w3-debug-html is 'style. | |
182 :outer -- do not include the current element in the element | |
183 context we report. | |
184 :nocontext -- do not include context where error detected. | |
185 The remaining parameters are treated as the body of a progn, the value of | |
186 which must be a string to use as the error message." | |
187 (let (mandatory-if bad-style outer nocontext condition) | |
188 (while (memq (car body) '(:mandatory-if :bad-style :outer :nocontext)) | |
189 (cond ((eq ':mandatory-if (car body)) | |
190 (setq mandatory-if (car (cdr body))) | |
191 (setq body (cdr (cdr body)))) | |
192 ((eq ':bad-style (car body)) | |
193 (setq bad-style t) | |
194 (setq body (cdr body))) | |
195 ((eq ':nocontext (car body)) | |
196 (setq nocontext t) | |
197 (setq body (cdr body))) | |
198 ((eq ':outer (car body)) | |
199 (setq outer t) | |
200 (setq body (cdr body))))) | |
201 (setq condition (if bad-style | |
202 '(eq 'style w3-debug-html) | |
203 'w3-debug-html)) | |
204 (if mandatory-if | |
205 (setq condition | |
206 (` (or (, mandatory-if) | |
207 (, condition))))) | |
208 (` (if (, condition) | |
209 (let ((message (progn (,@ body)))) | |
210 (if message | |
211 (w3-debug-html-aux message | |
212 (,@ (if nocontext | |
213 (list outer nocontext) | |
214 (if outer '(t))))))))))) | |
215 | |
216 ;; This is unsatisfactory. | |
217 (put 'w3-debug-html 'lisp-indent-function 0) | |
218 | |
219 (put 'w3-debug-html 'edebug-form-spec | |
220 '([&rest &or ":nocontext" ":outer" [":mandatory-if" form] ":bad-style"] | |
221 &rest form)) | |
222 ) | |
223 | |
224 (defun w3-debug-html-aux (message &optional outer nocontext) | |
225 (let ( | |
226 ;; We have already determined whether the user should see the | |
227 ;; message, so don't let w3-warn suppress it. | |
228 (w3-debug-html t)) | |
229 ;; Print the URL before the first error message for a document. | |
230 (cond (w3-p-d-debug-url | |
231 (let ((url (url-view-url t))) | |
232 (w3-warn 'html | |
233 (if (or (null url) | |
234 (string-equal "" url)) | |
235 (format "HTML errors for buffer %s" | |
236 (current-buffer)) | |
237 (format "HTML errors for <URL:%s>" url)))) | |
238 (setq w3-p-d-debug-url nil))) | |
239 (w3-warn 'html | |
240 (if nocontext | |
241 message | |
242 (concat message | |
243 ;; Display context information for each error | |
244 ;; message. | |
245 "\n Containing elements: " | |
246 (w3-open-elements-string (if outer 1)) | |
247 (concat | |
248 "\n Text around error: " | |
249 (save-restriction | |
250 (widen) | |
251 (progn | |
252 (insert "*ERROR*") | |
253 (prog1 | |
254 (w3-quote-for-string | |
255 (buffer-substring | |
256 (max (- (point) 27) (point-min)) | |
257 (min (+ (point) 20) (point-max)))) | |
258 (delete-char -7)))))))))) | |
259 | |
260 (defun w3-quote-for-string (string) | |
261 (save-excursion | |
262 (set-buffer (get-buffer-create " w3-quote-whitespace")) | |
263 (erase-buffer) | |
264 (insert string) | |
265 (goto-char (point-min)) | |
266 (insert "\"") | |
267 (while (progn | |
268 (skip-chars-forward "^\"\\\t\n\r") | |
269 (not (eobp))) | |
270 (insert "\\" (cdr (assq (following-char) '((?\" . "\"") | |
271 (?\\ . "\\") | |
272 (?\t . "t") | |
273 (?\n . "n") | |
274 (?\r . "r"))))) | |
275 (delete-char 1)) | |
276 (insert "\"") | |
277 (buffer-string))) | |
278 | |
279 | |
280 ;;; | |
281 ;;; General entity references and numeric character references. | |
282 ;;; | |
283 | |
284 ;; *** MULE conversion? | |
285 ;; *** I18N HTML support? | |
286 | |
287 (let ((html-entities w3-html-entities)) | |
288 (while html-entities | |
289 (put (car (car html-entities)) 'html-entity-expansion | |
290 (cons 'CDATA (if (integerp (cdr (car html-entities))) | |
291 (char-to-string (cdr (car html-entities))) | |
292 (cdr (car html-entities))))) | |
293 (setq html-entities (cdr html-entities)))) | |
294 | |
295 ;; These are handled differently than the normal HTML entities because | |
296 ;; we need to define the entities with 'nil instead of 'CDATA so | |
297 ;; that they are correctly scanned for new markup. | |
298 ;; | |
299 ;; from jbw@cs.bu.edu | |
300 ;; | |
301 ;;> Of course, this differs from the specification a bit. The W3C tech | |
302 ;;> report defines all of these as SYSTEM entities. This potentially means | |
303 ;;> that they can be used in more contexts. The method I outlined above | |
304 ;;> means "&smiley;" can only be used in contexts where IMG is a valid | |
305 ;;> element. I am not sure exactly where it is okay to use a SYSTEM entity. | |
306 ;;> I think anywhere that data characters are accepted. | |
307 ;; | |
308 ;; I find this acceptable, as just what the hell are you supposed to do with | |
309 ;; &computer; as part of a value of a form input when you display it and/or | |
310 ;; submit it?! | |
311 | |
312 (let ((html-entities w3-graphic-entities) | |
313 (cur nil)) | |
314 (while html-entities | |
315 (setq cur (car html-entities) | |
316 html-entities (cdr html-entities)) | |
317 (put (nth 0 cur) 'html-entity-expansion | |
318 (cons 'nil (format "img src=\"%s/%s%s\" alt=\"%s\"" | |
319 w3-icon-directory | |
320 (nth 1 cur) | |
321 (if w3-icon-format | |
322 (concat "." (symbol-name w3-icon-format)) | |
323 "") | |
324 (or (nth 3 cur) (nth 2 cur))))))) | |
325 | |
326 ;; These are the general entities in HTML 3.0 in terms of which the math | |
327 ;; shortrefs are defined: | |
328 ;; | |
329 ;; <!ENTITY REF1 STARTTAG "SUP"> | |
330 ;; <!ENTITY REF2 ENDTAG "SUP"> | |
331 ;; <!ENTITY REF3 STARTTAG "SUB"> | |
332 ;; <!ENTITY REF4 ENDTAG "SUB"> | |
333 ;; <!ENTITY REF5 STARTTAG "BOX"> | |
334 ;; <!ENTITY REF6 ENDTAG "BOX"> | |
335 ;; | |
336 ;; We're ignoring them because these names should really be local to the | |
337 ;; DTD and not visible in the document. They might change at any time in | |
338 ;; future HTML standards. | |
339 | |
340 ;; <!--Entities for language-dependent presentation (BIDI and contextual analysis) --> | |
341 ;; <!ENTITY zwnj CDATA "‌"-- zero width non-joiner--> | |
342 ;; <!ENTITY zwj CDATA "‍"-- zero width joiner--> | |
343 ;; <!ENTITY lrm CDATA "‎"-- left-to-right mark--> | |
344 ;; <!ENTITY rlm CDATA "‏"-- right-to-left mark--> | |
345 | |
346 ;; Entity names are case sensitive! | |
347 | |
348 ;; & should only be recognized when followed by letter or # and | |
349 ;; digit or # and letter. | |
350 | |
351 (eval-when-compile | |
352 | |
353 (w3-p-s-var-def w3-p-s-entity) | |
354 (w3-p-s-var-def w3-p-s-pos) | |
355 (w3-p-s-var-def w3-p-s-num) | |
356 ;; Destroys free variables: | |
357 ;; w3-p-s-entity, w3-p-s-pos, w3-p-s-num | |
358 ;; Depends on case-fold-search being t. | |
359 (defsubst w3-expand-entity-at-point-maybe () | |
360 ;; We are looking at a &. | |
361 ;; Only &A or  or &#A syntax is special. | |
362 (cond | |
363 ((and (looking-at "&\\([a-z][-a-z0-9.]*\\)[\ ;\n]?") ; \n should be \r | |
364 ;; We are looking at a general entity reference, maybe undefined. | |
365 (setq w3-p-s-entity | |
366 (get | |
367 (intern (buffer-substring (match-beginning 1) (match-end 1))) | |
368 'html-entity-expansion))) | |
369 | |
370 ;; If the reference was undefined, then for SGML, we should really | |
371 ;; issue a warning and delete the reference. However, the HTML | |
372 ;; standard (contradicting the SGML standard) says to leave the | |
373 ;; undefined reference in the text. | |
374 | |
375 ;; We are looking at a defined general entity reference. | |
376 (replace-match "") | |
377 (cond ((eq 'CDATA (car w3-p-s-entity)) | |
378 ;; Leave point after expansion so we don't rescan it. | |
379 (insert (cdr w3-p-s-entity))) | |
380 ((memq (car w3-p-s-entity) '(nil STARTTAG ENDTAG MS MD)) | |
381 ;; nil is how I mark ordinary entities. | |
382 ;; The replacement text gets rescanned for all of these. | |
383 (setq w3-p-s-pos (point)) | |
384 (insert (cdr (assq (car w3-p-s-entity) | |
385 '((nil . "") | |
386 (STARTTAG . "<") | |
387 (ENDTAG . "</") | |
388 (MS . "<![") | |
389 (MD . "<!")))) | |
390 (cdr w3-p-s-entity) | |
391 (cdr (assq (car w3-p-s-entity) | |
392 '((nil . "") | |
393 (STARTTAG . ">") | |
394 (ENDTAG . ">") | |
395 (MS . "]]>") | |
396 (MD . ">"))))) | |
397 (goto-char w3-p-s-pos) | |
398 ;; *** Strictly speaking, if we parse anything from the | |
399 ;; replacement text, it must end before the end of the | |
400 ;; replacement text. | |
401 ) | |
402 ((eq 'SDATA (car w3-p-s-entity)) | |
403 (insert "[Unimplemented SDATA \"%s\"]" (cdr w3-p-s-entity))) | |
404 ((eq 'PI (car w3-p-s-entity)) | |
405 ;; We are currently ignoring processing instructions. | |
406 ;; *** Strictly speaking, we should issue a warning if this | |
407 ;; occurs in a attribute value. | |
408 ) | |
409 (t | |
410 ;; *** We don't handle external entities yet. | |
411 (error "[Unimplemented entity: \"%s\"]" w3-p-s-entity)))) | |
412 | |
413 ((looking-at "&#[0-9][0-9]*\\([\ ;\n]?\\)") ; \n should be \r | |
414 ;; We are looking at a numeric character reference. | |
415 ;; Ensure the number is already terminated by a semicolon or carriage | |
416 ;; return so we can use "read" to get it as a number quickly. | |
417 (cond ((= (match-beginning 1) (match-end 1)) | |
418 ;; This is very uncommon, so we don't have to be quick here but | |
419 ;; rather correct. | |
420 (save-excursion | |
421 (goto-char (match-end 0)) ; same as match-end 1 | |
422 (insert ?\;)) | |
423 ;; Set up the match data properly | |
424 (looking-at "&#[0-9][0-9]*;"))) | |
425 (forward-char 2) | |
426 (setq w3-p-s-num (read (current-buffer))) | |
427 ;; Always leave point after the expansion of a numeric | |
428 ;; character reference, like it were a CDATA entity. | |
429 (replace-match "") | |
430 ;; char-to-string will hopefully do something useful with characters | |
431 ;; larger than 255. I think in MULE it does. Is this true? | |
432 ;; Bill wants to call w3-resolve-numeric-entity here, but I think | |
433 ;; that functionality belongs in char-to-string. | |
434 ;; The largest valid character in the I18N version of HTML is 65533. | |
435 ;; <URL:ftp://ds.internic.net/internet-drafts/draft-ietf-html-i18n-01.txt> | |
436 (insert (char-to-string w3-p-s-num))) | |
437 | |
438 ((looking-at "&#\\(re\\|rs\\|space\\|tab\\)[\ ;\n]?") ; \n should be \r | |
439 (replace-match (assq (upcase (char-after (+ 3 (point)))) | |
440 '(;; *** Strictly speaking, record end should be | |
441 ;; carriage return. | |
442 (?E . "\n") ; RE | |
443 ;; *** And record start should be line feed. | |
444 (?S . "") ; RS | |
445 (?P . " ") ; SPACE | |
446 (?A . "\t")))) ; TAB | |
447 ;; Leave point after the expansion of a character reference, so it | |
448 ;; doesn't get rescanned. | |
449 ;; *** Strictly speaking, we should issue a warning for &#foo; if foo | |
450 ;; is not a function character in the SGML declaration. | |
451 ) | |
452 | |
453 ((eq ?& (following-char)) | |
454 ;; We are either looking at an undefined reference or a & that does | |
455 ;; not start a reference (in which case we should not have been called). | |
456 ;; Skip over the &. | |
457 (forward-char 1)) | |
458 | |
459 (t | |
460 ;; What is the code doing calling us if we're not looking at a "&"? | |
461 (error "this should never happen")))) | |
462 | |
463 ) | |
464 | |
465 | |
466 ;;; | |
467 ;;; Syntax table used in markup declarations. | |
468 ;;; | |
469 | |
470 (defvar w3-sgml-md-syntax-table | |
471 (let ((table (make-syntax-table)) | |
472 (items '( | |
473 (0 "." 255) ; clear everything | |
474 (?\r " ") | |
475 (?\t " ") | |
476 (?\n " ") | |
477 (32 " ") ; space | |
478 (?< "\(>") | |
479 (?> "\)<") | |
480 (?\( "\(\)") | |
481 (?\) "\)\(") | |
482 (?\[ "\(\]") | |
483 (?\] "\)\[") | |
484 (?\" "\"") | |
485 (?\' "\"") | |
486 (?a "w" ?z) | |
487 (?A "w" ?Z) | |
488 (?0 "w" ?9) | |
489 (?. "w") | |
490 ;; "-" can be a character in a NAME, but it is also used in | |
491 ;; "--" as both a comment start and end within SGML | |
492 ;; declarations ("<!" ... ">"). In HTML, it is only used | |
493 ;; as a NAME character in the parameter entities | |
494 ;; Content-Type, HTTP-Method, and style-notations and in | |
495 ;; the attribute name http-equiv and in the notation names | |
496 ;; dsssl-lite and w3c-style. We would like to be able to | |
497 ;; train Emacs to skip over these kinds of comments with | |
498 ;; forward-sexp and backward-sexp. Is there any way to | |
499 ;; teach Emacs how to do this? It doesn't seem to be the | |
500 ;; case. | |
501 (?- "w") | |
502 ))) | |
503 (while items | |
504 (let* ((item (car items)) | |
505 (char (car item)) | |
506 (syntax (car (cdr item))) | |
507 (bound (or (car-safe (cdr-safe (cdr item))) | |
508 char))) | |
509 (while (<= char bound) | |
510 (modify-syntax-entry char syntax table) | |
511 (setq char (1+ char)))) | |
512 (setq items (cdr items))) | |
513 table) | |
514 "A syntax table for parsing SGML markup declarations.") | |
515 | |
516 | |
517 ;;; | |
518 ;;; Element information data type. | |
519 ;;; | |
520 | |
521 ;; The element information data type is used in two ways: | |
522 ;; | |
523 ;; * To store the DTD, there is one element record for each element in | |
524 ;; the DTD. | |
525 ;; | |
526 ;; * To store information for open elements in the current parse tree. | |
527 ;; Each such element is initialized by copying the element record | |
528 ;; from the DTD. This means that values in the fields can not be | |
529 ;; destructively altered, although of course the fields can be | |
530 ;; changed. | |
531 | |
532 ;; The cells in this vector are: | |
533 ;; | |
534 ;; name: the element's name (a generic identifier). | |
535 ;; | |
536 ;; end-tag-name: a symbol whose name should be the result of prefixing | |
537 ;; the generic-identifier with a slash. This is a convenience value for | |
538 ;; interfacing with the display engine which expects a stream of start | |
539 ;; and end tags in this format rather than a tree. | |
540 ;; | |
541 ;; content-model: a data structure describing what elements or character | |
542 ;; data we expect to find within this element. This is either a symbol | |
543 ;; listed here: | |
544 ;; | |
545 ;; EMPTY: no content, no end-tag allowed. | |
546 ;; CDATA: all data characters until "</[a-z]" is seen. | |
547 ;; XCDATA: special non-SGML-standard mode which includes all data | |
548 ;; characters until "</foo" is seen where "foo" is the name of this | |
549 ;; element. (for XMP and LISTING) | |
550 ;; XXCDATA: special non-SGML-standard mode which includes all data | |
551 ;; until end-of-entity (end-of-buffer for us). (for PLAINTEXT) | |
552 ;; RCDATA: all data characters until "</[a-z]" is seen, except that | |
553 ;; entities are expanded first, although the expansions are not | |
554 ;; scanned for end-tags. | |
555 ;; XINHERIT: special non-SGML-standard mode which means to use the | |
556 ;; content model of the containing element instead. | |
557 ;; | |
558 ;; or a vector of this structure: | |
559 ;; | |
560 ;; [(INCLUDES INCSPACEP (((TAG ...) . TRANSITION) ...) DEFAULT) ...] | |
561 ;; | |
562 ;; where INCLUDES is of the format: | |
563 ;; | |
564 ;; (TAG ...) | |
565 ;; | |
566 ;; where each TRANSITION is one of these: | |
567 ;; | |
568 ;; (ACTION NEW-STATE ERRORP) | |
569 ;; (ACTION NEW-STATE) | |
570 ;; (ACTION) | |
571 ;; | |
572 ;; where DEFAULT is one of these: | |
573 ;; | |
574 ;; nil or TRANSITION | |
575 ;; | |
576 ;; where the meaning of the components is: | |
577 ;; | |
578 ;; INCLUDES is a list of tags for which the transition (*include *same | |
579 ;; nil) applies. | |
580 ;; | |
581 ;; DEFAULT if non-nil is a transition that should be taken when | |
582 ;; matching any possibility not explicitly listed in another | |
583 ;; TRANSITION, except for data characters containing only whitespace. | |
584 ;; | |
585 ;; INCSPACEP specifies how to handle data characters which include | |
586 ;; only whitespace characters. The value is non-nil to indicate | |
587 ;; (*include *same nil) or nil to indicate (*discard *same nil). | |
588 ;; | |
589 ;; TAG is a symbol corresponding to the start-tag we are looking at, | |
590 ;; or *data when seeing character data that includes at least one | |
591 ;; non-space character. | |
592 ;; | |
593 ;; ACTION is one of: | |
594 ;; *close: Close this element and try again using content model of | |
595 ;; enclosing element. (Note that this does not apply to the | |
596 ;; case of an element being closed by its own end-tag.) | |
597 ;; *include: Process new element as subelement of this one or | |
598 ;; include data characters directly. | |
599 ;; *discard: Discard a start-tag or data characters. | |
600 ;; *retry: Try again after processing NEW-STATE and ERRORP. | |
601 ;; ELEMENT: Open ELEMENT (with default attributes), then try again | |
602 ;; using its content model. | |
603 ;; | |
604 ;; NEW-STATE (optional, default *same) is the index of the state to | |
605 ;; move to after processing the element or one of these: | |
606 ;; *same: no state change occurs. | |
607 ;; *next: change the the current state + 1. | |
608 ;; The initial state is 0. NEW-STATE does not matter if ACTION is | |
609 ;; *close. | |
610 ;; | |
611 ;; ERRORP (optional, default nil) if non-nil indicates this transition | |
612 ;; represents an error. The error message includes this value if it | |
613 ;; is a string. | |
614 ;; | |
615 ;; If no matching transition is found, the default transition is | |
616 ;; (*discard *same "not allowed here"). | |
617 ;; | |
618 ;; overrides: An alist of pairs of the form (VAR REPLACEP . VALUE). | |
619 ;; When this element is opened, the old value of VAR is saved in the | |
620 ;; undo-list. If REPLACEP is non-nil, then VAR gets value VALUE, | |
621 ;; otherwise VAR gets value (append VALUE (symbol-value VAR)). Useful | |
622 ;; values for VAR are: | |
623 ;; | |
624 ;; w3-p-d-exceptions: See doc string. | |
625 ;; | |
626 ;; w3-p-d-shortrefs: See doc string. | |
627 ;; | |
628 ;; w3-p-d-shortref-chars: See doc string. | |
629 ;; | |
630 ;; end-tag-omissible: Whether it is legal to omit the end-tag of this | |
631 ;; element. If an end-tag is inferred for an element whose end tag is | |
632 ;; not omissible, an error message is given. | |
633 ;; | |
634 ;; state: The current state in the content model. Preset to the initial | |
635 ;; state of 0. | |
636 ;; | |
637 ;; undo-list: an alist of of former values of local variables | |
638 ;; of w3-parse-buffer to restore upon closing this element. Each | |
639 ;; item on the list is of the format (VAR . VALUE-TO-RESTORE). | |
640 ;; | |
641 ;; attributes: an alist of attributes and values. Each item on | |
642 ;; this list is of the format (ATTRIBUTE-NAME . VALUE). Each | |
643 ;; ATTRIBUTE-NAME is a symbol and each attribute value is a | |
644 ;; string. | |
645 ;; | |
646 ;; content: a list of the accumulated content of the element. While the | |
647 ;; element is open, the list is in order from latest to earliest, | |
648 ;; otherwise it is in order from earliest to latest. Each member is | |
649 ;; either a string of data characters or a list of the form (NAME | |
650 ;; ATTRIBUTES CONTENT), where NAME is the subelement's name, ATTRIBUTES | |
651 ;; is an alist of the subelement's attribute names (lowercase symbols) | |
652 ;; and their values (strings), and CONTENT is the subelement's content. | |
653 | |
654 (eval-when-compile | |
655 | |
656 (defconst w3-element-fields | |
657 '(name end-tag-name content-model state overrides undo-list | |
658 content attributes end-tag-omissible deprecated)) | |
659 | |
660 (let* ((fields w3-element-fields) | |
661 (index (1- (length fields)))) | |
662 (while fields | |
663 (let* ((field (symbol-name (car fields))) | |
664 (get-sym (intern (concat "w3-element-" field))) | |
665 (set-sym (intern (concat "w3-set-element-" field)))) | |
666 (eval (` (progn | |
667 (defmacro (, get-sym) (element) | |
668 (list 'aref element (, index))) | |
669 (defmacro (, set-sym) (element value) | |
670 (list 'aset element (, index) value)))))) | |
671 (setq fields (cdr fields)) | |
672 (setq index (1- index)))) | |
673 | |
674 (defmacro w3-make-element () | |
675 (list 'make-vector (length w3-element-fields) nil)) | |
676 | |
677 ;; *** move this to be with DTD declaration. | |
678 (defmacro w3-fresh-element-for-tag (tag) | |
679 (` (copy-sequence | |
680 (or (get (, tag) 'html-element-info) | |
681 (error "unimplemented element %s" | |
682 (w3-sgml-name-to-string (, tag))))))) | |
683 | |
684 ;; *** move this to be with DTD declaration. | |
685 (defmacro w3-known-element-p (tag) | |
686 (` (get (, tag) 'html-element-info))) | |
687 | |
688 (defsubst w3-sgml-name-to-string (sym) | |
689 (upcase (symbol-name sym))) | |
690 | |
691 ) | |
692 | |
693 | |
694 ;;; | |
695 ;;; Parse tree manipulation. | |
696 ;;; | |
697 | |
698 ;; ;; Find the name of the previous element or a substring of the | |
699 ;; ;; preceding data characters. | |
700 ;; (let ((content (w3-element-content (car stack)))) | |
701 ;; (while content | |
702 ;; (cond | |
703 ;; ((and (stringp (car content)) | |
704 ;; (not (string-match "\\`[ \t\n\r]*\\'" (car content)))) | |
705 ;; (setq prior-item (car content)) | |
706 ;; ;; Trim trailing whitespace | |
707 ;; (if (string-match "\\(.*[^ \t\n\r]\\)[ \t\n\r]*\\'" prior-item) | |
708 ;; (setq prior-item (substring prior-item 0 (match-end 1)))) | |
709 ;; (if (> (length prior-item) 8) | |
710 ;; (setq prior-item (concat "..." (substring prior-item -8)))) | |
711 ;; (setq prior-item (w3-quote-for-string prior-item)) | |
712 ;; (setq prior-item (concat "\(after " prior-item "\)")) | |
713 ;; (setq content nil)) | |
714 ;; ((and (consp (car content)) | |
715 ;; (symbolp (car (car content)))) | |
716 ;; (setq prior-item | |
717 ;; (concat "\(after " | |
718 ;; (w3-sgml-name-to-string (car (car content))) | |
719 ;; "\)")) | |
720 ;; (setq content nil)) | |
721 ;; (t | |
722 ;; (setq content (cdr content)))))) | |
723 | |
724 ;; Only used for HTML debugging. | |
725 (defun w3-open-elements-string (&optional skip-count) | |
726 (let* ((stack (nthcdr (or skip-count 0) | |
727 (cons w3-p-d-current-element | |
728 w3-p-d-open-element-stack))) | |
729 ;;(prior-item "(at start)") | |
730 result) | |
731 ;; Accumulate the names of the enclosing elements. | |
732 (while stack | |
733 (let ((element (w3-element-name (car stack)))) | |
734 (if (eq '*holder element) | |
735 nil | |
736 ;; Only include *DOCUMENT if there are no other elements. | |
737 (if (or (not (eq '*document element)) | |
738 (null result)) | |
739 (setq result (cons (w3-sgml-name-to-string element) | |
740 result))))) | |
741 (setq stack (cdr stack))) | |
742 (setq result (mapconcat 'identity result ":")) | |
743 (if result | |
744 ;;(concat | |
745 result | |
746 ;; prior-item) | |
747 "[nowhere!]"))) | |
748 | |
749 ;; *** This doesn't really belong here, but where? | |
750 (eval-when-compile | |
751 (defmacro w3-invalid-sgml-chars () | |
752 "Characters not allowed in an SGML document using the reference | |
753 concrete syntax (i.e. HTML). Returns a string in the format expected by | |
754 skip-chars-forward." | |
755 "\000-\010\013\014\016-\037\177-\237")) | |
756 | |
757 (eval-when-compile | |
758 ;; Uses: | |
759 ;; w3-p-d-null-end-tag-enabled, w3-p-d-in-parsed-marked-section, | |
760 ;; w3-p-d-shortref-chars | |
761 ;; Modifies free variable: | |
762 ;; w3-p-d-non-markup-chars | |
763 (defsubst w3-update-non-markup-chars () | |
764 (setq w3-p-d-non-markup-chars | |
765 (concat "^&<" | |
766 (w3-invalid-sgml-chars) | |
767 (if w3-p-d-null-end-tag-enabled "/" "") | |
768 (if w3-p-d-in-parsed-marked-section "]" "") | |
769 (or w3-p-d-shortref-chars "")))) | |
770 | |
771 ;; Modifies free variable: | |
772 ;; w3-p-d-parse-tag-stream-tail-pointer | |
773 (defsubst w3-add-display-item (tag value) | |
774 (setcdr w3-p-d-parse-tag-stream-tail-pointer | |
775 (list (cons tag value))) | |
776 (setq w3-p-d-parse-tag-stream-tail-pointer | |
777 (cdr w3-p-d-parse-tag-stream-tail-pointer))) | |
778 | |
779 ) | |
780 | |
781 (eval-when-compile | |
782 (w3-p-s-var-def w3-p-s-overrides) | |
783 (w3-p-s-var-def w3-p-s-undo-list) | |
784 (w3-p-s-var-def w3-p-s-var) | |
785 ;; Uses free variables: | |
786 ;; w3-p-d-non-markup-chars | |
787 ;; Modifies free variables: | |
788 ;; w3-p-d-current-element, w3-p-d-open-element-stack | |
789 ;; Destroys free variables: | |
790 ;; w3-p-s-overrides, w3-p-s-undo-list, w3-p-s-var | |
791 (defsubst w3-open-element (tag attributes) | |
792 | |
793 ;; Send trailing data character item in the old current element to | |
794 ;; display engine. | |
795 (if (stringp (car-safe (w3-element-content w3-p-d-current-element))) | |
796 (w3-add-display-item | |
797 'text | |
798 (car-safe (w3-element-content w3-p-d-current-element)))) | |
799 | |
800 ;; Push new element on stack. | |
801 (setq w3-p-d-open-element-stack (cons w3-p-d-current-element | |
802 w3-p-d-open-element-stack)) | |
803 (setq w3-p-d-current-element (w3-fresh-element-for-tag tag)) | |
804 | |
805 ;; Warn if deprecated or obsolete. | |
806 (if (w3-element-deprecated w3-p-d-current-element) | |
807 (w3-debug-html :outer | |
808 (format "%s element %s." | |
809 (if (eq 'obsolete | |
810 (w3-element-deprecated w3-p-d-current-element)) | |
811 "Obsolete" | |
812 "Deprecated") | |
813 (w3-sgml-name-to-string | |
814 (w3-element-name w3-p-d-current-element))))) | |
815 | |
816 ;; Store attributes. | |
817 ;; *** we are not handling #CURRENT attributes (HTML has none). | |
818 (w3-set-element-attributes w3-p-d-current-element attributes) | |
819 ;; *** Handle default attribute values. | |
820 ;; *** Fix the attribute name for unnamed values. Right now they will | |
821 ;; be in the attribute list as items of the format (VALUE . VALUE) where | |
822 ;; both occurrences of VALUE are the same. The first one needs to be | |
823 ;; changed to the proper attribute name by consulting the DTD. | |
824 ;; ******************** | |
825 | |
826 ;; Handle syntax/semantics overrides of new current element. | |
827 (cond ((w3-element-overrides w3-p-d-current-element) | |
828 (setq w3-p-s-overrides | |
829 (w3-element-overrides w3-p-d-current-element)) | |
830 (setq w3-p-s-undo-list nil) | |
831 (while w3-p-s-overrides | |
832 (setq w3-p-s-var (car (car w3-p-s-overrides))) | |
833 (setq w3-p-s-undo-list | |
834 (cons (cons w3-p-s-var | |
835 (symbol-value w3-p-s-var)) | |
836 w3-p-s-undo-list)) | |
837 (set w3-p-s-var (if (car (cdr (car w3-p-s-overrides))) | |
838 (cdr (cdr (car w3-p-s-overrides))) | |
839 (append (cdr (cdr (car w3-p-s-overrides))) | |
840 (symbol-value w3-p-s-var)))) | |
841 ;; *** HACK HACK. | |
842 ;; Magic handling of w3-p-d-shortref-chars. | |
843 (cond ((eq 'w3-p-d-shortref-chars w3-p-s-var) | |
844 (setq w3-p-s-undo-list | |
845 (cons (cons 'w3-p-d-non-markup-chars | |
846 w3-p-d-non-markup-chars) | |
847 w3-p-s-undo-list)) | |
848 (w3-update-non-markup-chars))) | |
849 (setq w3-p-s-overrides (cdr w3-p-s-overrides))) | |
850 (w3-set-element-undo-list w3-p-d-current-element | |
851 w3-p-s-undo-list))) | |
852 | |
853 ;; Handle content-model inheritance. (Very non-SGML!) | |
854 (if (eq 'XINHERIT (w3-element-content-model w3-p-d-current-element)) | |
855 (w3-set-element-content-model | |
856 w3-p-d-current-element | |
857 (w3-element-content-model (car w3-p-d-open-element-stack)))) | |
858 | |
859 ;; Send the start-tag and attributes to the display engine. | |
860 (if (memq tag '(plaintext style xmp textarea)) | |
861 ;; Garbage special-casing for old display engine. | |
862 ;; Nothing is sent until end-tag is found. | |
863 ;; The DTD will ensure no subelements of these elements. | |
864 nil | |
865 ;; Normal procedure. | |
866 (w3-add-display-item tag attributes))) | |
867 ) | |
868 | |
869 ;; The protocol for handing items to the display engine is as follows. | |
870 ;; | |
871 ;; For an element, send (START-TAG . ATTS), each member of the content, | |
872 ;; and (END-TAG . nil) if the element is allowed to have an end tag. | |
873 ;; | |
874 ;; For data characters, send (text . DATA-CHARACTERS). | |
875 ;; | |
876 ;; Exceptions: | |
877 ;; | |
878 ;; For PLAINTEXT, STYLE, XMP, TEXTAREA send: | |
879 ;; (START-TAG . ((data . DATA-CHARACTERS) . ATTS)). | |
880 ;; | |
881 ;; *** This requires somehow eliminating any subelements of the TEXTAREA | |
882 ;; element. TEXTAREA can contain subelements in HTML 3.0. | |
883 ;; | |
884 ;; For LISTING, send (text . DATA-CHARACTERS). (Is this really correct or | |
885 ;; is this perhaps a bug in the old parser?) I'm ignoring this for now. | |
886 | |
887 (eval-when-compile | |
888 (w3-p-s-var-def w3-p-s-undo-list) | |
889 (w3-p-s-var-def w3-p-s-content) | |
890 (w3-p-s-var-def w3-p-s-end-tag) | |
891 ;; Modifies free variables: | |
892 ;; w3-p-d-current-element, w3-p-d-open-element-stack | |
893 ;; Accesses free variables: | |
894 ;; w3-p-d-tag-name, w3-p-d-end-tag-p | |
895 ;; Destroys free variables: | |
896 ;; w3-p-s-undo-list, w3-p-s-content, w3-p-s-end-tag | |
897 (defsubst w3-close-element (&optional inferred) | |
898 ;; inferred: non-nil if the end-tag of the current element is being | |
899 ;; inferred due to the presence of content not allowed in the current | |
900 ;; element. If t, then the tag causing this is in w3-p-d-tag-name and | |
901 ;; w3-p-d-end-tag-p. | |
902 ;; (OLD: ... otherwise it is a symbol indicating the start-tag | |
903 ;; of an element or *data or *space indicating data characters.) | |
904 | |
905 (cond ((and inferred | |
906 (not (w3-element-end-tag-omissible w3-p-d-current-element))) | |
907 (w3-debug-html | |
908 (format "</%s> end-tag not omissible (required due to %s)" | |
909 (w3-sgml-name-to-string | |
910 (w3-element-name w3-p-d-current-element)) | |
911 (cond ((eq t inferred) | |
912 (format (if w3-p-d-end-tag-p | |
913 "</%s> end-tag" | |
914 "start-tag for %s") | |
915 (w3-sgml-name-to-string | |
916 w3-p-d-tag-name))) | |
917 ;; *** Delete this functionality? | |
918 ((memq inferred '(*space *data)) | |
919 "data characters") | |
920 ((symbolp inferred) | |
921 (format "start-tag for %s" | |
922 (w3-sgml-name-to-string inferred))) | |
923 ))))) | |
924 | |
925 ;; Undo any variable bindings of this element. | |
926 (cond ((w3-element-undo-list w3-p-d-current-element) | |
927 (setq w3-p-s-undo-list | |
928 (w3-element-undo-list w3-p-d-current-element)) | |
929 (while w3-p-s-undo-list | |
930 (set (car (car w3-p-s-undo-list)) | |
931 (cdr (car w3-p-s-undo-list))) | |
932 (setq w3-p-s-undo-list (cdr w3-p-s-undo-list))))) | |
933 | |
934 (setq w3-p-s-end-tag | |
935 (w3-element-end-tag-name w3-p-d-current-element)) | |
936 | |
937 ;; Fix up the content of the current element in preparation for putting | |
938 ;; it in the parent. | |
939 ;; Remove trailing newline from content, if there is one, otherwise send | |
940 ;; any trailing data character item to display engine. | |
941 (setq w3-p-s-content (w3-element-content w3-p-d-current-element)) | |
942 (cond ((null w3-p-s-content)) | |
943 ((equal "\n" (car w3-p-s-content)) | |
944 (setq w3-p-s-content (cdr w3-p-s-content))) | |
945 ((and (stringp (car w3-p-s-content)) | |
946 ;; Garbage special-casing for old display engine. | |
947 (not (memq w3-p-s-end-tag | |
948 '(/plaintext /style /xmp /textarea)))) | |
949 (w3-add-display-item 'text (car w3-p-s-content)))) | |
950 | |
951 ;; Send the end-tag to the display engine, but only if the element is | |
952 ;; allowed to have an end tag. | |
953 (cond ((memq w3-p-s-end-tag '(/plaintext /style /xmp /textarea)) | |
954 ;; Garbage special-casing for old display engine. | |
955 ;; Format old display engine expects for these elements: | |
956 ;; (START-TAG . ((data . DATA-CHARACTERS) . ATTRIBUTES)) | |
957 (w3-add-display-item | |
958 ;; Use the *start*-tag, not the end-tag. | |
959 (w3-element-name w3-p-d-current-element) | |
960 (cons (cons 'data | |
961 (condition-case nil | |
962 (mapconcat 'identity w3-p-s-content "") | |
963 (error "eeek! subelement content!"))) | |
964 (w3-element-attributes w3-p-d-current-element)))) | |
965 ;; *** Handle LISTING the way the old parser did. | |
966 ((eq 'EMPTY (w3-element-content-model w3-p-d-current-element)) | |
967 ;; Do nothing, can't have an end tag. | |
968 ) | |
969 (t | |
970 ;; Normal case. | |
971 (w3-add-display-item w3-p-s-end-tag nil) | |
972 (if (null w3-p-s-content) | |
973 (w3-debug-html | |
974 :bad-style :outer | |
975 ;; Don't warn for empty TD elements or empty A elements | |
976 ;; with no HREF attribute. | |
977 ;; *** Crude hack that should really be encoded in the | |
978 ;; element database somehow. | |
979 (if (or (not (memq (w3-element-name w3-p-d-current-element) | |
980 '(a td))) | |
981 (assq 'href | |
982 (w3-element-attributes w3-p-d-current-element))) | |
983 (format "Empty %s element." | |
984 (w3-sgml-name-to-string | |
985 (w3-element-name w3-p-d-current-element)))))))) | |
986 | |
987 ;; Put the current element in the proper place in its parent. | |
988 ;; This will cause an error if we overpop the stack. | |
989 (w3-set-element-content | |
990 (car w3-p-d-open-element-stack) | |
991 (cons (list (w3-element-name w3-p-d-current-element) | |
992 (w3-element-attributes w3-p-d-current-element) | |
993 (nreverse w3-p-s-content)) | |
994 (w3-element-content (car w3-p-d-open-element-stack)))) | |
995 | |
996 ;; Pop the stack. | |
997 (setq w3-p-d-current-element (car w3-p-d-open-element-stack)) | |
998 (setq w3-p-d-open-element-stack (cdr w3-p-d-open-element-stack))) | |
999 | |
1000 ) | |
1001 | |
1002 | |
1003 ;;; | |
1004 ;;; A pseudo-DTD for HTML. | |
1005 ;;; | |
1006 | |
1007 (eval-when-compile | |
1008 ;; This works around the following bogus compiler complaint: | |
1009 ;; While compiling the end of the data in file w3-parse.el: | |
1010 ;; ** the function w3-expand-parameters is not known to be defined. | |
1011 ;; This is a bogus error. Anything of this form will trigger this message: | |
1012 ;; (eval-when-compile (defun xyzzy () (xyzzy))) | |
1013 (defun w3-expand-parameters (pars data) nil)) | |
1014 | |
1015 (eval-when-compile | |
1016 (defun w3-expand-parameters (pars data) | |
1017 (cond ((null data) | |
1018 nil) | |
1019 ((consp data) | |
1020 ;; This has to be written carefully to avoid exceeding the | |
1021 ;; maximum lisp function call nesting depth. | |
1022 (let (result) | |
1023 (while (consp data) | |
1024 (let ((car-exp (w3-expand-parameters pars (car data)))) | |
1025 (setq result | |
1026 (if (and (symbolp (car data)) | |
1027 (not (eq car-exp (car data))) | |
1028 ;; An expansion occurred. | |
1029 (listp car-exp)) | |
1030 ;; The expansion was a list, which we splice in. | |
1031 (condition-case err | |
1032 (append (reverse car-exp) result) | |
1033 (wrong-type-argument | |
1034 (if (eq 'listp (nth 1 err)) | |
1035 ;; Wasn't really a "list" since the last | |
1036 ;; cdr wasn't nil, so don't try to splice | |
1037 ;; it in. | |
1038 (cons car-exp result) | |
1039 (signal (car err) (cdr err))))) | |
1040 (cons car-exp result)))) | |
1041 (setq data (cdr data))) | |
1042 (append (nreverse result) | |
1043 (w3-expand-parameters pars data)))) | |
1044 ((symbolp data) | |
1045 (let ((sym-exp (cdr-safe (assq data pars)))) | |
1046 (if sym-exp | |
1047 (w3-expand-parameters pars sym-exp) | |
1048 data))) | |
1049 ((vectorp data) | |
1050 (let ((i 0) | |
1051 (result (copy-sequence data))) | |
1052 (while (< i (length data)) | |
1053 (aset result i | |
1054 (w3-expand-parameters pars (aref data i))) | |
1055 (setq i (1+ i))) | |
1056 result)) | |
1057 (t | |
1058 data)))) | |
1059 | |
1060 (eval-when-compile | |
1061 (defun w3-unfold-dtd (items) | |
1062 (let (result) | |
1063 (while items | |
1064 (let* ((item (car items)) | |
1065 (names (car item)) | |
1066 (content-model | |
1067 (or (cdr-safe (assq 'content-model item)) | |
1068 (error "impossible"))) | |
1069 (overrides (cdr-safe (assq 'overrides item))) | |
1070 (end-tag-omissible | |
1071 (or (cdr-safe (assq 'end-tag-omissible item)) | |
1072 ;; *** Is this SGML standard? | |
1073 (eq 'EMPTY content-model))) | |
1074 (deprecated (cdr-safe (assq 'deprecated item))) | |
1075 element | |
1076 name) | |
1077 (while names | |
1078 (setq name (car names)) | |
1079 (setq names (cdr names)) | |
1080 | |
1081 ;; Create and initialize the element information data | |
1082 ;; structure. | |
1083 (setq element (w3-make-element)) | |
1084 (w3-set-element-name element name) | |
1085 (w3-set-element-end-tag-name | |
1086 element | |
1087 (intern (concat "/" (symbol-name name)))) | |
1088 (w3-set-element-state element 0) | |
1089 (w3-set-element-content-model element content-model) | |
1090 (w3-set-element-end-tag-omissible element end-tag-omissible) | |
1091 | |
1092 (or (memq deprecated '(nil t obsolete)) | |
1093 (error "impossible")) | |
1094 (w3-set-element-deprecated element deprecated) | |
1095 | |
1096 ;; Inclusions and exclusions are specified differently in the | |
1097 ;; human-coded DTD than in the format the implementation uses. | |
1098 ;; The human-coded version is designed to be easy to edit and to | |
1099 ;; work with w3-expand-parameters while the internal version is | |
1100 ;; designed to be fast. We have to translate here. This work | |
1101 ;; is repeated for every element listed in `names' so that the | |
1102 ;; exclusion exception error messages can be accurate. | |
1103 (let ((inclusions (cdr-safe (assq 'inclusions item))) | |
1104 (exclusions (cdr-safe (assq 'exclusions item))) | |
1105 (exclusion-mode '*close) | |
1106 (exclusion-message | |
1107 (format "%s exclusion" (w3-sgml-name-to-string name))) | |
1108 exceptions) | |
1109 (while inclusions | |
1110 (setq exceptions (cons (cons (car inclusions) | |
1111 '(*include *same nil)) | |
1112 exceptions)) | |
1113 (setq inclusions (cdr inclusions))) | |
1114 (while exclusions | |
1115 (cond ((memq (car exclusions) '(*discard *include *close)) | |
1116 (setq exclusion-mode (car exclusions))) | |
1117 ((stringp (car exclusions)) | |
1118 (setq exclusion-message (car exclusions))) | |
1119 (t | |
1120 (setq exceptions (cons (list (car exclusions) | |
1121 exclusion-mode | |
1122 '*same | |
1123 exclusion-message) | |
1124 exceptions)))) | |
1125 (setq exclusions (cdr exclusions))) | |
1126 (let ((overrides (if exceptions | |
1127 (cons (cons 'w3-p-d-exceptions | |
1128 (cons nil exceptions)) | |
1129 overrides) | |
1130 overrides))) | |
1131 (w3-set-element-overrides element overrides))) | |
1132 | |
1133 (setq result (cons (cons name element) result)))) | |
1134 (setq items (cdr items))) | |
1135 result))) | |
1136 | |
1137 ;; Load the HTML DTD. | |
1138 ;; <URL:ftp://ds.internic.net/rfc/rfc1866.txt> | |
1139 ;; *** Be sure to incorporate rfc1867 when attribute-checking is added. | |
1140 ;; *** Write function to check sanity of the content-model forms. | |
1141 ;; *** I18N: Add Q, BDO, SPAN | |
1142 (mapcar | |
1143 (function | |
1144 (lambda (pair) | |
1145 (put (car pair) 'html-element-info (cdr pair)))) | |
1146 ;; The purpose of this complexity is to speed up loading by | |
1147 ;; pre-evaluating as much as possible at compile time. | |
1148 (eval-when-compile | |
1149 (w3-unfold-dtd | |
1150 (w3-expand-parameters | |
1151 '( | |
1152 (%headempty . (link base meta range)) | |
1153 (%head-deprecated . (nextid)) | |
1154 | |
1155 ;; client-side imagemaps | |
1156 (%imagemaps . (area map)) | |
1157 ;; special action is taken for %text inside %body.content in the | |
1158 ;; content model of each element. | |
1159 (%body.content . (%heading %block hr div address %imagemaps)) | |
1160 | |
1161 (%heading . (h1 h2 h3 h4 h5 h6)) | |
1162 | |
1163 ;; Netscape's CENTER, FONT, and BASEFONT are handled | |
1164 ;; non-standardly. In actual psuedo-HTML on the net, these are | |
1165 ;; used as both text-level constructs and block-level constructs. | |
1166 ;; They are the only items in both %block and %text in this | |
1167 ;; definition here. | |
1168 ;; *** Perhaps add BLINK here too? But no one uses that as a block | |
1169 ;; construct. What about NOBR? | |
1170 (%netscape-crud . (center font basefont)) | |
1171 | |
1172 ;; Emacs-w3 extensions | |
1173 (%emacsw3-crud-nonempty . (roach secret wired)) | |
1174 (%emacsw3-crud . (pinhead flame cookie yogsothoth hype peek)) | |
1175 | |
1176 (%block . (p %list dl form %preformatted %netscape-crud font | |
1177 %blockquote isindex fn table fig note | |
1178 %block-deprecated %block-obsoleted)) | |
1179 (%list . (ul ol)) | |
1180 (%preformatted . (pre)) | |
1181 (%blockquote . (bq)) | |
1182 (%block-deprecated . (dir menu blockquote)) | |
1183 (%block-obsoleted . (xmp listing)) | |
1184 | |
1185 ;; Why is IMG in this list? | |
1186 (%pre.exclusion . (*include img *discard tab math big small sub sup)) | |
1187 | |
1188 (%text . (*data b %notmath %netscape-crud sub sup | |
1189 %emacsw3-crud %emacsw3-crud-nonempty)) | |
1190 (%notmath . (%special %font %phrase %misc)) | |
1191 (%font . (i u s strike tt big small)) ; B left out for MATH handling | |
1192 (%phrase . (em strong code samp kbd var cite blink)) | |
1193 ;; Don't know if this is right place for EMBED. | |
1194 (%special . (a img br wbr nobr tab math embed)) | |
1195 (%misc . (q lang au dfn person acronym abbrev ins del)) | |
1196 | |
1197 (%formula . (*data %math)) | |
1198 (%math . (box above below %mathvec root sqrt array sub sup | |
1199 %mathface)) | |
1200 (%mathvec . (vec bar dot ddot hat tilde)) | |
1201 (%mathface . (b t bt)) | |
1202 | |
1203 (%mathdelims . (over atop choose left right of)) | |
1204 | |
1205 ;; What the hell? This takes BODYTEXT????? No way! | |
1206 (%bq-content-model . [(nil | |
1207 nil | |
1208 (((bodytext) *include *next)) | |
1209 (bodytext *next)) | |
1210 (nil | |
1211 nil | |
1212 (((credit) *include *next)) | |
1213 nil) | |
1214 (nil nil nil nil) | |
1215 ]) | |
1216 | |
1217 ;; non-default bad HTML handling. | |
1218 (%in-text-ignore . ((p %heading) *discard *same error)) | |
1219 ) | |
1220 '( | |
1221 ;; A dummy element that will contain *document. | |
1222 ((*holder) | |
1223 (content-model . [(nil nil nil nil)])) | |
1224 ;; The root of the parse tree. We start with a pseudo-element | |
1225 ;; named *document for convenience. | |
1226 ((*document) | |
1227 (content-model . [(nil nil (((html) *include *next)) (html *next)) | |
1228 (nil | |
1229 nil | |
1230 nil | |
1231 (*include *same "after document end"))]) | |
1232 (end-tag-omissible . t)) | |
1233 ;; HTML O O (HEAD, BODY) | |
1234 ((html) | |
1235 (content-model . [(nil | |
1236 nil | |
1237 (((head) *include *next)) | |
1238 (head *next)) | |
1239 (nil | |
1240 nil | |
1241 (((body) *include *next) | |
1242 ;; Netscape stuff | |
1243 ((frameset) *include 4) | |
1244 ) | |
1245 (body *next)) | |
1246 (nil | |
1247 nil | |
1248 (((plaintext) *include *next)) | |
1249 (*retry *next)) | |
1250 (nil | |
1251 nil | |
1252 nil | |
1253 (*include *same "after BODY")) | |
1254 (nil | |
1255 nil | |
1256 nil | |
1257 (*include *same "after FRAMESET")) | |
1258 ]) | |
1259 (end-tag-omissible . t)) | |
1260 ((head) | |
1261 (content-model . [((title isindex %headempty style %head-deprecated) | |
1262 nil | |
1263 nil | |
1264 ;; *** Should only close if tag can | |
1265 ;; legitimately follow head. So many can that | |
1266 ;; I haven't bothered to enumerate them. | |
1267 (*close))]) | |
1268 (end-tag-omissible . t)) | |
1269 ;; TITLE - - (#PCDATA) | |
1270 ((title) | |
1271 (content-model . RCDATA ; not official | |
1272 ;; [((*data) include-space nil nil)] | |
1273 )) | |
1274 ;; STYLE - O (#PCDATA) | |
1275 ;; STYLE needs to be #PCDATA to allow omitted end tag. Bleagh. | |
1276 ((style) | |
1277 (content-model . [((*data) | |
1278 include-space | |
1279 nil | |
1280 ;; *** Should only close if tag can | |
1281 ;; legitimately follow style. So many can that | |
1282 ;; I haven't bothered to enumerate them. | |
1283 (*close))]) | |
1284 (end-tag-omissible . t)) | |
1285 ((body) | |
1286 (content-model . [((banner) nil nil (*retry *next)) | |
1287 ((bodytext) nil nil (bodytext *next)) | |
1288 (nil nil (((plaintext) *close)) nil)]) | |
1289 (inclusions . (spot)) | |
1290 (end-tag-omissible . t)) | |
1291 ;; Do I really want to include BODYTEXT? It has something to do | |
1292 ;; with mixed content screwing things up, and I don't understand | |
1293 ;; it. Wait! It's used by BQ! | |
1294 ((bodytext) | |
1295 (content-model . [((%body.content) | |
1296 nil | |
1297 ;; Push <P> before data characters. Non-SGML. | |
1298 (((%text) p) | |
1299 ;; Closing when seeing CREDIT is a stupidity | |
1300 ;; caused by BQ's sharing of BODYTEXT. BQ | |
1301 ;; should have its own BQTEXT. | |
1302 ((credit plaintext) *close)) | |
1303 nil)]) | |
1304 (end-tag-omissible . t)) | |
1305 ((div banner) | |
1306 (content-model . [((%body.content) | |
1307 nil | |
1308 ;; Push <P> before data characters. Non-SGML. | |
1309 (((%text) p)) | |
1310 nil)])) | |
1311 ((address) | |
1312 (content-model . [((p) | |
1313 nil | |
1314 ;; Push <P> before data characters. Non-SGML. | |
1315 (((%text) p)) | |
1316 nil)])) | |
1317 ((%heading) | |
1318 (content-model . [((%text) | |
1319 include-space | |
1320 ((%in-text-ignore)) | |
1321 nil)])) | |
1322 ((p) | |
1323 (content-model . [((%text) | |
1324 include-space | |
1325 nil | |
1326 ;; *** Should only close if tag can | |
1327 ;; legitimately follow P. So many can that I | |
1328 ;; don't bother to enumerate here. | |
1329 (*close))]) | |
1330 (end-tag-omissible . t)) | |
1331 ((ul ol) | |
1332 (content-model . [((lh) | |
1333 nil | |
1334 (((li) *include *next)) | |
1335 (*retry *next)) | |
1336 ((li) | |
1337 nil | |
1338 ;; Push <LI> before data characters or block | |
1339 ;; elements. | |
1340 ;; Non-SGML. | |
1341 (((%text %block) li *same error)) | |
1342 nil)])) | |
1343 ((lh) | |
1344 (content-model . [((%text) | |
1345 include-space | |
1346 (((dd dt li) *close) | |
1347 (%in-text-ignore)) | |
1348 nil)]) | |
1349 (end-tag-omissible . t)) | |
1350 ((dir menu) | |
1351 (content-model . [((li) | |
1352 nil | |
1353 (((%text) li *same error)) | |
1354 nil)]) | |
1355 (exclusions . (%block)) | |
1356 (deprecated . t)) | |
1357 ((li) | |
1358 (content-model . [((%block) | |
1359 nil | |
1360 (((li) *close) | |
1361 ;; Push <P> before data characters. Non-SGML. | |
1362 ((%text) p)) | |
1363 nil)]) | |
1364 (end-tag-omissible . t) | |
1365 ;; Better bad HTML handling. | |
1366 ;; Technically, there are a few valid documents that this will | |
1367 ;; hose, because you can have H1 inside FORM inside LI. However, | |
1368 ;; I don't think that should be allowed anyway. | |
1369 (exclusions . (*discard "not allowed here" %heading))) | |
1370 ((dl) | |
1371 (content-model . [((lh) | |
1372 nil | |
1373 (((dt dd) *include *next)) | |
1374 (*retry *next)) | |
1375 ((dt dd) | |
1376 nil | |
1377 ;; Push <DD> before data characters or block | |
1378 ;; items. | |
1379 ;; Non-SGML. | |
1380 (((%text %block) dd *same error)) | |
1381 nil)])) | |
1382 ((dt) | |
1383 (content-model . [((%text) | |
1384 include-space | |
1385 (((dd dt) *close) | |
1386 (%in-text-ignore)) | |
1387 nil)]) | |
1388 (end-tag-omissible . t)) | |
1389 ;; DD is just like LI, but we treat it separately because it can be | |
1390 ;; followed by a different set of elements. | |
1391 ((dd) | |
1392 (content-model . [((%block) | |
1393 nil | |
1394 (((dt dd) *close) | |
1395 ;; Push <P> before data characters. Non-SGML. | |
1396 ((%text) p)) | |
1397 nil)]) | |
1398 (end-tag-omissible . t) | |
1399 ;; See comment with LI. | |
1400 (exclusions . (*discard "not allowed here" %heading))) | |
1401 ((pre) | |
1402 (content-model . [((%text hr) | |
1403 include-space | |
1404 ((%in-text-ignore)) | |
1405 nil)]) | |
1406 (exclusions . (%pre.exclusion))) | |
1407 ;; BLOCKQUOTE deprecated, BQ okay | |
1408 ((bq) | |
1409 (content-model . %bq-content-model)) | |
1410 ((blockquote) | |
1411 (content-model . %bq-content-model) | |
1412 ;; BLOCKQUOTE is deprecated in favor of BQ in the HTML 3.0 DTD. | |
1413 ;; However, BQ is not even mentioned in the HTML 2.0 DTD. So I | |
1414 ;; don't think we can enable this yet: | |
1415 ;;(deprecated . t) | |
1416 ) | |
1417 ((fn note) | |
1418 (content-model . [((%body.content) | |
1419 nil | |
1420 ;; Push <P> before data characters. Non-SGML. | |
1421 (((%text) p)) | |
1422 nil)])) | |
1423 ((fig) | |
1424 (content-model . [((overlay) nil nil (*retry *next)) | |
1425 (nil | |
1426 nil | |
1427 (((caption) *include *next)) | |
1428 (*retry *next)) | |
1429 (nil | |
1430 nil | |
1431 (((figtext) *include *next) | |
1432 ((credit) *retry *next)) | |
1433 ;; *** Should only do this for elements that | |
1434 ;; can be in FIGTEXT. | |
1435 (figtext *next)) | |
1436 (nil nil (((credit) *include *next)) nil) | |
1437 (nil nil nil nil)])) | |
1438 ((caption credit) | |
1439 (content-model . [((%text) | |
1440 nil | |
1441 ((%in-text-ignore)) | |
1442 nil)])) | |
1443 ((figtext) | |
1444 (content-model . [((%body.content) | |
1445 nil | |
1446 ;; Push <P> before data characters. Very non-SGML. | |
1447 (((%text) p) | |
1448 ((credit) *close)) | |
1449 nil)]) | |
1450 (end-tag-omissible . t)) | |
1451 ((%emacsw3-crud) | |
1452 (content-model . EMPTY)) | |
1453 ((%netscape-crud) | |
1454 ;; Special non-SGML treatment of Netscape's shit. | |
1455 (content-model . XINHERIT)) | |
1456 ;; FORM - - %body.content -(FORM) +(INPUT|KEYGEN|SELECT|TEXTAREA) | |
1457 ((form) | |
1458 ;; Same as BODY. Ugh! | |
1459 (content-model . [((%body.content) | |
1460 nil | |
1461 ;; Push <P> before data characters. Non-SGML. | |
1462 (((%text) p)) | |
1463 nil)]) | |
1464 (exclusions . (form)) | |
1465 (inclusions . (input select textarea keygen label))) | |
1466 ;; *** Where is the URL describing this? | |
1467 ((label) | |
1468 (content-model . [((%text) | |
1469 include-space | |
1470 nil | |
1471 nil)]) | |
1472 ;; *** These are already included, no need to repeat. | |
1473 ;;(inclusions . (input select textarea)) | |
1474 ;; *** Is a LABEL allowed inside a LABEL? I assume no. | |
1475 (exclusions . (label)) | |
1476 ;; The next line just does the default so is unneeded: | |
1477 ;;(end-tag-omissible . nil) | |
1478 ) | |
1479 ;; SELECT - - (OPTION+) -(INPUT|KEYGEN|TEXTAREA|SELECT)> | |
1480 ;; *** This should be -(everything). | |
1481 ((select) | |
1482 (content-model . [((option) nil nil nil)]) | |
1483 (exclusions . (input label select keygen textarea))) | |
1484 ;; option - O (#PCDATA) | |
1485 ;; needs to be #PCDATA to allow omitted end tag. | |
1486 ((option) | |
1487 ;; I'd like to make this RCDATA to avoid problems with inclusions | |
1488 ;; like SPOT, but that would conflict with the omitted end-tag, I | |
1489 ;; think. | |
1490 (content-model . [((*data) | |
1491 include-space | |
1492 (((option) *close)) | |
1493 nil)]) | |
1494 (end-tag-omissible . t)) | |
1495 ;; TEXTAREA - - (#PCDATA) -(INPUT|TEXTAREA|KEYGEN|SELECT) | |
1496 ((textarea) | |
1497 ;; Same comment as for OPTION about RCDATA. | |
1498 (content-model . [((*data) include-space nil nil)]) | |
1499 (exclusions . (input select label keygen textarea))) | |
1500 ((hr br img isindex input keygen overlay wbr spot tab | |
1501 %headempty %mathdelims) | |
1502 (content-model . EMPTY)) | |
1503 ((nextid) | |
1504 (content-model . EMPTY) | |
1505 (deprecated . t)) | |
1506 ((a) | |
1507 (content-model . [((%text) | |
1508 include-space | |
1509 (((%heading) | |
1510 *include *same "deprecated inside A") | |
1511 ;; *** I haven't made up my mind whether this | |
1512 ;; is a good idea. It can result in a lot of | |
1513 ;; bad formatting if the A is *never* closed. | |
1514 ;;((p) *discard *same error) | |
1515 ) | |
1516 nil)]) | |
1517 (exclusions . (a))) | |
1518 ((b %font %phrase %misc nobr %emacsw3-crud-nonempty) | |
1519 (content-model . [((%text) | |
1520 include-space | |
1521 ((%in-text-ignore)) | |
1522 nil)])) | |
1523 ((plaintext) | |
1524 (content-model . XXCDATA) | |
1525 (end-tag-omissible . t) | |
1526 (deprecated . obsolete)) | |
1527 ((xmp listing) | |
1528 (content-model . XCDATA) | |
1529 (deprecated . obsolete)) | |
1530 ;; Latest table spec (as of Nov. 13 1995) is at: | |
1531 ;; <URL:ftp://ds.internic.net/internet-drafts/draft-ietf-html-tables-03.txt> | |
1532 ((table) | |
1533 (content-model . [(nil | |
1534 nil | |
1535 (((caption) *include *next) | |
1536 ((col colgroup thead tfoot tbody tr) *retry *next)) | |
1537 (*retry *next)) ;error handling | |
1538 ((col colgroup) | |
1539 nil | |
1540 (((thead tfoot tbody tr) *retry *next)) | |
1541 (*retry *next)) ;error handling | |
1542 (nil | |
1543 nil | |
1544 (((thead) *include *next) | |
1545 ((tfoot tbody tr) *retry *next)) | |
1546 (*retry *next)) ;error handling | |
1547 (nil | |
1548 nil | |
1549 (((tfoot) *include *next) | |
1550 ((tbody tr) *retry *next)) | |
1551 (*retry *next)) ;error handling | |
1552 ((tbody) | |
1553 nil | |
1554 (((tr) tbody *same) | |
1555 ;; error handling | |
1556 ((%body.content) tbody *same error)) | |
1557 nil)])) | |
1558 ((colgroup) | |
1559 (content-model . [((col) | |
1560 nil | |
1561 (((colgroup thead tfoot tbody tr) *close)) | |
1562 nil)]) | |
1563 (end-tag-omissible . t)) | |
1564 ((col) | |
1565 (content-model . EMPTY)) | |
1566 ((thead) | |
1567 (content-model . [((tr) | |
1568 nil | |
1569 (((tfoot tbody) *close) | |
1570 ;; error handling | |
1571 ((%body.content) tr *same error)) | |
1572 nil)]) | |
1573 (end-tag-omissible . t)) | |
1574 ((tfoot tbody) | |
1575 (content-model . [((tr) | |
1576 nil | |
1577 (((tbody) *close) | |
1578 ;; error handling | |
1579 ((%body.content) tr *same error)) | |
1580 nil)]) | |
1581 (end-tag-omissible . t)) | |
1582 ((tr) | |
1583 (content-model . [((td th) | |
1584 nil | |
1585 (((tr tfoot tbody) *close) | |
1586 ;; error handling | |
1587 ((%body.content) td *same error)) | |
1588 nil)]) | |
1589 (end-tag-omissible . t)) | |
1590 ((td th) | |
1591 ;; Arrgh! Another %body.content!!! Stupid!!! | |
1592 (content-model . [((%body.content) | |
1593 nil | |
1594 (((td th tr tfoot tbody) *close) | |
1595 ;; Push <P> before data characters. Non-SGML. | |
1596 ((%text) p)) | |
1597 nil)]) | |
1598 (end-tag-omissible . t)) | |
1599 ((math) | |
1600 (content-model . [((*data) include-space nil nil)]) | |
1601 (overrides . | |
1602 ((w3-p-d-shortref-chars t . "\{_^") | |
1603 (w3-p-d-shortrefs t . (("\\^" . "<sup>") | |
1604 ("_" . "<sub>") | |
1605 ("{" . "<box>"))))) | |
1606 (inclusions . (%math)) | |
1607 (exclusions . (%notmath))) | |
1608 ((sup) | |
1609 (content-model . [((%text) | |
1610 include-space | |
1611 ((%in-text-ignore)) | |
1612 nil)]) | |
1613 (overrides . | |
1614 ((w3-p-d-shortref-chars t . "\{_^") | |
1615 (w3-p-d-shortrefs t . (("\\^" . "</sup>") | |
1616 ("_" . "<sub>") | |
1617 ("{" . "<box>")))))) | |
1618 ((sub) | |
1619 (content-model . [((%text) | |
1620 include-space | |
1621 ((%in-text-ignore)) | |
1622 nil)]) | |
1623 (overrides . | |
1624 ((w3-p-d-shortref-chars t . "\{_^") | |
1625 (w3-p-d-shortrefs t . (("\\^" . "<sup>") | |
1626 ("_" . "</sub>") | |
1627 ("{" . "<box>")))))) | |
1628 ((box) | |
1629 (content-model . [((%formula) | |
1630 include-space | |
1631 (((left) *include 1) | |
1632 ((over atop choose) *include 2) | |
1633 ((right) *include 3)) | |
1634 nil) | |
1635 ((%formula) | |
1636 include-space | |
1637 (((over atop choose) *include 2) | |
1638 ((right) *include 3)) | |
1639 nil) | |
1640 ((%formula) | |
1641 include-space | |
1642 (((right) *include 3)) | |
1643 nil) | |
1644 ((%formula) include-space nil nil)]) | |
1645 (overrides . | |
1646 ((w3-p-d-shortref-chars t . "{}_^") | |
1647 (w3-p-d-shortrefs t . (("\\^" . "<sup>") | |
1648 ("_" . "<sub>") | |
1649 ("{" . "<box>") | |
1650 ("}" . "</box>")))))) | |
1651 ((above below %mathvec t bt sqrt) | |
1652 (content-model . [((%formula) include-space nil nil)])) | |
1653 ;; ROOT has a badly-specified content-model in HTML 3.0. | |
1654 ((root) | |
1655 (content-model . [((%formula) | |
1656 include-space | |
1657 (((of) *include *next)) | |
1658 nil) | |
1659 ((%formula) include-space nil nil)])) | |
1660 ((of) | |
1661 (content-model . [((%formula) include-space nil nil)]) | |
1662 ;; There is no valid way to infer a missing end-tag for OF. This | |
1663 ;; is bizarre. | |
1664 (end-tag-omissible . t)) | |
1665 ((array) | |
1666 (content-model . [((row) nil nil nil)])) | |
1667 ((row) | |
1668 (content-model . [((item) nil (((row) *close)) nil)]) | |
1669 (end-tag-omissible . t)) | |
1670 ((item) | |
1671 (content-model . [((%formula) | |
1672 include-space | |
1673 (((row item) *close)) | |
1674 nil)]) | |
1675 (end-tag-omissible . t)) | |
1676 ;; The old parser would look for the </EMBED> end-tag and include | |
1677 ;; the contents between <EMBED> and </EMBED> as the DATA attribute | |
1678 ;; of the EMBED start-tag. However, it did not require the | |
1679 ;; </EMBED> end-tag and did nothing if it was missing. This is | |
1680 ;; completely impossible to specify in SGML. | |
1681 ;; | |
1682 ;; See | |
1683 ;; <URL:http://www.eit.com/goodies/lists/www.lists/www-html.1995q3/0603.html> | |
1684 ;; | |
1685 ;; Questions: Does EMBED require the end-tag? How does NOEMBED fit | |
1686 ;; into this? Where can EMBED appear? | |
1687 ;; | |
1688 ;; Nov. 25 1995: a new spec for EMBED (also an I-D): | |
1689 ;; <URL:http://www.cs.princeton.edu/~burchard/www/interactive/> | |
1690 ;; | |
1691 ;; Here is my guess how to code EMBED: | |
1692 ((embed) | |
1693 (content-model . [((noembed) nil nil (*close))])) | |
1694 ((noembed) | |
1695 (content-model . [((%body.content) ; hack hack hack | |
1696 nil | |
1697 (((%text) p)) | |
1698 nil)])) | |
1699 ;; | |
1700 ;; FRAMESET is a Netscape thing. | |
1701 ;; <URL:http://www.eit.com/goodies/lists/www.lists/www-html.1995q3/0588.html> | |
1702 ((frameset) | |
1703 (content-model . [((noframes frame frameset) nil nil nil)])) | |
1704 ((noframes) | |
1705 (content-model . [((%body.content) | |
1706 nil | |
1707 ;; Push <P> before data characters. Non-SGML. | |
1708 (((%text) p)) | |
1709 nil)])) | |
1710 ((frame) | |
1711 (content-model . EMPTY)) | |
1712 ;; | |
1713 ;; APPLET is a Java thing. | |
1714 ;; <URL:http://java.sun.com/JDK-beta/filesinkit/README> | |
1715 ((applet) | |
1716 ;; I really don't want to add another ANY content-model. | |
1717 (content-model . XINHERIT) | |
1718 (inclusions . (param))) | |
1719 ((param) | |
1720 (content-model . EMPTY)) | |
1721 ;; backward compatibility with old Java. | |
1722 ((app) | |
1723 (content-model . EMPTY)) | |
1724 ;; Client-side image maps. | |
1725 ;; <URL:ftp://ds.internic.net/internet-drafts/draft-seidman-clientsideimagemap-01.txt> | |
1726 ;; *** The only problem is that I don't know in what elements MAP | |
1727 ;; can appear, so none of this is reachable yet. | |
1728 ((map) | |
1729 (content-model . [((area) nil nil nil)])) | |
1730 ((area) | |
1731 (content-model . EMPTY)) | |
1732 ))))) | |
1733 | |
1734 | |
1735 ;;; | |
1736 ;;; Omitted tag inference using state transition tables. | |
1737 ;;; | |
1738 | |
1739 (eval-when-compile | |
1740 | |
1741 (w3-p-s-var-def w3-p-s-includep) | |
1742 (w3-p-s-var-def w3-p-s-state-transitions) | |
1743 (w3-p-s-var-def w3-p-s-transition) | |
1744 (w3-p-s-var-def w3-p-s-tran-list) | |
1745 (w3-p-s-var-def w3-p-s-content-model) | |
1746 (w3-p-s-var-def w3-p-s-except) | |
1747 ;; Uses free variables: | |
1748 ;; w3-p-d-current-element, w3-p-d-exceptions | |
1749 ;; Destroys free variables: | |
1750 ;; w3-p-s-includep, w3-p-s-state-transitions, w3-p-s-transition, | |
1751 ;; w3-p-s-tran-list, w3-p-s-content-model, w3-p-s-except | |
1752 ;; Returns t if the element or data characters should be included. | |
1753 ;; Returns nil if the element or data characters should be discarded. | |
1754 (defsubst w3-grok-tag-or-data (tag-name) | |
1755 (while | |
1756 (cond | |
1757 ((symbolp (setq w3-p-s-content-model | |
1758 (w3-element-content-model w3-p-d-current-element))) | |
1759 (or (and (memq w3-p-s-content-model | |
1760 '(CDATA RCDATA XCDATA XXCDATA)) | |
1761 (memq tag-name '(*data *space))) | |
1762 ;; *** Implement ANY. | |
1763 (error "impossible")) | |
1764 (setq w3-p-s-includep t) | |
1765 ;; Exit loop. | |
1766 nil) | |
1767 (t | |
1768 ;; We have a complex content model. | |
1769 ;; Cache some data from the element info structure. Format is: | |
1770 ;; (INCLUDES INCSPACEP (((TAG ...) . TRANSITION) ...) DEFAULT) | |
1771 (setq w3-p-s-state-transitions | |
1772 (aref w3-p-s-content-model | |
1773 (w3-element-state w3-p-d-current-element))) | |
1774 | |
1775 ;; Optimize the common cases. | |
1776 (cond | |
1777 ((eq '*space tag-name) | |
1778 ;; Optimizing the (*space *discard *same nil) transition. | |
1779 (setq w3-p-s-includep (car (cdr w3-p-s-state-transitions))) | |
1780 ;; Don't loop. | |
1781 nil) | |
1782 ((and (not (setq w3-p-s-except | |
1783 (assq tag-name w3-p-d-exceptions))) | |
1784 (memq tag-name (car w3-p-s-state-transitions))) | |
1785 ;; Equivalent to a transition of (TAG *include *same nil). | |
1786 ;; So we are done, return t to caller. | |
1787 (setq w3-p-s-includep t) | |
1788 ;; Exit loop. | |
1789 nil) | |
1790 (t | |
1791 ;; The general case. | |
1792 (cond | |
1793 ;; Handle inclusions and exclusions. | |
1794 (w3-p-s-except | |
1795 (setq w3-p-s-transition (cdr w3-p-s-except))) | |
1796 ;; See if the transition is in the complex transitions | |
1797 ;; component. | |
1798 ((progn | |
1799 (setq w3-p-s-tran-list | |
1800 (car (cdr (cdr w3-p-s-state-transitions)))) | |
1801 (setq w3-p-s-transition nil) | |
1802 (while w3-p-s-tran-list | |
1803 (cond ((memq tag-name (car (car w3-p-s-tran-list))) | |
1804 ;; We've found a transition. | |
1805 (setq w3-p-s-transition | |
1806 (cdr (car w3-p-s-tran-list))) | |
1807 (setq w3-p-s-tran-list nil)) | |
1808 (t | |
1809 (setq w3-p-s-tran-list (cdr w3-p-s-tran-list))))) | |
1810 ;; Check if we found it. | |
1811 w3-p-s-transition) | |
1812 ;; body of cond clause empty | |
1813 ) | |
1814 ;; Try finding the transition in the DEFAULT component of the | |
1815 ;; transition table, but avoid doing this for unknown elements, | |
1816 ;; always use the default-default for them. | |
1817 ((and (or (eq '*data tag-name) | |
1818 (w3-known-element-p tag-name)) | |
1819 (setq w3-p-s-transition | |
1820 (nth 3 w3-p-s-state-transitions))) | |
1821 ;; body of cond clause empty | |
1822 ) | |
1823 (t | |
1824 ;; Supply a default-default transition. | |
1825 (if (not (or (eq '*data tag-name) | |
1826 (w3-known-element-p tag-name))) | |
1827 (setq w3-p-s-transition | |
1828 '(*discard *same "unknown element")) | |
1829 | |
1830 ;; Decide whether to *close or *discard | |
1831 ;; based on whether this element would be | |
1832 ;; accepted as valid in an open ancestor. | |
1833 (let ((open-list w3-p-d-open-element-stack) | |
1834 (all-end-tags-omissible | |
1835 (w3-element-end-tag-omissible w3-p-d-current-element)) | |
1836 state-transitions tran-list) | |
1837 (if (catch 'found | |
1838 (while open-list | |
1839 (setq state-transitions | |
1840 (aref (w3-element-content-model | |
1841 (car open-list)) | |
1842 (w3-element-state (car open-list)))) | |
1843 (if (memq tag-name (car state-transitions)) | |
1844 (throw 'found t)) | |
1845 (setq tran-list (nth 2 state-transitions)) | |
1846 (while tran-list | |
1847 (cond ((memq tag-name (car (car tran-list))) | |
1848 (if (not (nth 3 (car tran-list))) | |
1849 ;; Not an error transition. | |
1850 (throw 'found t)) | |
1851 (setq tran-list nil)) | |
1852 (t | |
1853 (setq tran-list (cdr tran-list))))) | |
1854 ;; The input item is not accepted in this | |
1855 ;; ancestor. Try again in next ancestor. | |
1856 (or (w3-element-end-tag-omissible (car open-list)) | |
1857 (setq all-end-tags-omissible nil)) | |
1858 (setq open-list (cdr open-list))) | |
1859 nil) | |
1860 (setq w3-p-s-transition | |
1861 (if (w3-element-end-tag-omissible | |
1862 w3-p-d-current-element) | |
1863 (if all-end-tags-omissible | |
1864 ;; Probably indicates a need to debug | |
1865 ;; the DTD state-transition tables. | |
1866 '(*close *same | |
1867 "missing transition in DTD?") | |
1868 ;; Error will be reported later. | |
1869 '(*close *same)) | |
1870 '(*close *same "not allowed here"))) | |
1871 (setq w3-p-s-transition | |
1872 '(*discard *same "not allowed here"))))))) | |
1873 | |
1874 ;; We have found a transition to take. The transition is of | |
1875 ;; the format (ACTION NEW-STATE ERRORP) where the latter two | |
1876 ;; items are optional. | |
1877 | |
1878 ;; First, handle any state-change. | |
1879 (or (memq (car-safe (cdr w3-p-s-transition)) '(nil *same)) | |
1880 (w3-set-element-state | |
1881 w3-p-d-current-element | |
1882 (if (eq '*next (car-safe (cdr w3-p-s-transition))) | |
1883 (1+ (w3-element-state w3-p-d-current-element)) | |
1884 (car-safe (cdr w3-p-s-transition))))) | |
1885 | |
1886 ;; Handle any error message. | |
1887 (if (car-safe (cdr-safe (cdr w3-p-s-transition))) | |
1888 (w3-debug-html | |
1889 :mandatory-if (and (eq '*data tag-name) | |
1890 (eq '*discard (car w3-p-s-transition))) | |
1891 (format "Bad %s [%s], %s" | |
1892 (if (eq '*data tag-name) | |
1893 "data characters" | |
1894 (concat "start-tag " | |
1895 (w3-sgml-name-to-string tag-name))) | |
1896 (if (stringp (car (cdr (cdr w3-p-s-transition)))) | |
1897 (car (cdr (cdr w3-p-s-transition))) | |
1898 "not allowed here") | |
1899 (let ((action (car w3-p-s-transition))) | |
1900 (cond ((eq '*discard action) | |
1901 "discarding bad item") | |
1902 ((eq '*close action) | |
1903 (concat "inferring </" | |
1904 (w3-sgml-name-to-string | |
1905 (w3-element-name | |
1906 w3-p-d-current-element)) | |
1907 ">")) | |
1908 ((eq '*include action) | |
1909 "including bad item anyway") | |
1910 ((eq '*retry action) | |
1911 "*retry ??? you shouldn't see this") | |
1912 (t | |
1913 (concat "inferring <" | |
1914 (w3-sgml-name-to-string action) | |
1915 ">"))))))) | |
1916 | |
1917 ;; Handle the action. | |
1918 (cond | |
1919 ((eq '*include (car w3-p-s-transition)) | |
1920 (setq w3-p-s-includep t) | |
1921 ;; Exit loop. | |
1922 nil) | |
1923 ((eq '*close (car w3-p-s-transition)) | |
1924 ;; Perform end-tag inference. | |
1925 (w3-close-element) ; don't pass parameter | |
1926 ;; Loop and try again in parent element's content-model. | |
1927 t) | |
1928 ((eq '*discard (car w3-p-s-transition)) | |
1929 (setq w3-p-s-includep nil) | |
1930 ;; Exit loop. | |
1931 nil) | |
1932 ((eq '*retry (car w3-p-s-transition)) | |
1933 ;; Loop and try again after state change. | |
1934 t) | |
1935 ((symbolp (car w3-p-s-transition)) | |
1936 ;; We need to open another element to contain the text, | |
1937 ;; probably a <P> (look in the state table). | |
1938 (w3-open-element (car w3-p-s-transition) nil) | |
1939 ;; Now we loop and try again in the new element's | |
1940 ;; content-model. | |
1941 t) | |
1942 (t | |
1943 (error "impossible"))))))) | |
1944 | |
1945 ;; Empty while loop body. | |
1946 ) | |
1947 | |
1948 ;; Return value to user indicating whether to include or discard item: | |
1949 ;; t ==> include | |
1950 ;; nil ==> discard | |
1951 w3-p-s-includep) | |
1952 | |
1953 ) | |
1954 | |
1955 | |
1956 ;;; | |
1957 ;;; Main parser. | |
1958 ;;; | |
1959 | |
1960 (defvar w3-last-parse-tree nil | |
1961 "Used for debugging only. Stores the most recently computed parse tree | |
1962 \(a tree, not a parse tag stream\).") | |
1963 | |
1964 (defun w3-display-parse-tree (&optional ptree) | |
1965 (interactive) | |
1966 (with-output-to-temp-buffer "W3 HTML Parse Tree" | |
1967 (set-buffer standard-output) | |
1968 (emacs-lisp-mode) | |
1969 (require 'pp) | |
1970 (pp (or ptree w3-last-parse-tree)))) | |
1971 | |
1972 (defalias 'w3-display-last-parse-tree 'w3-display-parse-tree) | |
1973 | |
1974 ;; For compatibility with the old parser interface. | |
1975 (defalias 'w3-preparse-buffer 'w3-parse-buffer) | |
1976 | |
1977 ;; %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% | |
1978 ;; % % | |
1979 ;; % This is the *ONLY* valid entry point in this file! % | |
1980 ;; % DO NOT call any of the other functions! % | |
1981 ;; % % | |
1982 ;; %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% | |
1983 (defun w3-parse-buffer (&optional buff nodraw) | |
1984 "Parse contents of BUFF as HTML. | |
1985 BUFF defaults to the value of url-working-buffer. | |
1986 Destructively alters contents of BUFF. | |
1987 Unless optional second argument NODRAW is non-nil, calls the display | |
1988 engine on the parsed HTML. | |
1989 Returns a data structure containing the parsed information." | |
1990 | |
1991 (set-buffer (or buff url-working-buffer)) | |
1992 (setq buff (current-buffer)) | |
1993 (set-syntax-table w3-sgml-md-syntax-table) | |
1994 (buffer-disable-undo (current-buffer)) | |
1995 (widen) ; sanity checking | |
1996 (goto-char (point-min)) | |
1997 (setq case-fold-search t) ; allows smaller regexp patterns | |
1998 | |
1999 ;; Some unknown pre-parse buffer munging. | |
2000 (if (fboundp 'sera-to-fidel-marker) | |
2001 (let ((sera-being-called-by-w3 t)) | |
2002 ;; eval stops the compiler from complaining. | |
2003 (eval '(sera-to-fidel-marker)))) | |
2004 (goto-char (point-min)) | |
2005 | |
2006 ;; *** Should premunge line boundaries. | |
2007 ;; ******************** | |
2008 | |
2009 ;; Prepare another buffer to draw in unless told not to. | |
2010 (if (not nodraw) | |
2011 (w3-prepare-draw-buffer-for-parse-buffer)) | |
2012 | |
2013 (let* ( | |
2014 ;; Speed hack, see the variable doc string. | |
2015 (gc-cons-threshold (if (> w3-gc-cons-threshold-multiplier 0) | |
2016 (* w3-gc-cons-threshold-multiplier | |
2017 gc-cons-threshold) | |
2018 gc-cons-threshold)) | |
2019 | |
2020 ;; Used to determine if we made any progress since the last loop. | |
2021 (last-loop-start (point-min)) | |
2022 | |
2023 ;; How many iterations of the main loop have occurred. Used only | |
2024 ;; to send messages to the user periodically, since this function | |
2025 ;; can take some time. | |
2026 (loop-count 0) | |
2027 | |
2028 ;; Precomputing the loop-invariant parts of this for speed. | |
2029 (status-message-format | |
2030 (if url-show-status | |
2031 (format "Parsed %%3d%%%% of %d..." (- (point-max) (point-min))))) | |
2032 | |
2033 ;; Use a float value for 100 if possible, otherwise integer. | |
2034 ;; Determine which we can use outside of the loop for speed. | |
2035 (one-hundred (funcall (if (fboundp 'float) 'float 'identity) 100)) | |
2036 | |
2037 ;; Speed up checking whether to do incremental display. | |
2038 (w3-do-incremental-display (if nodraw nil w3-do-incremental-display)) | |
2039 | |
2040 ;; Used to convert parse tree to tag stream that old display | |
2041 ;; engine expects. Will change when display engine is rewritten. | |
2042 (parse-tag-stream '(*dummy)) | |
2043 | |
2044 ;; See doc string. | |
2045 (w3-p-d-parse-tag-stream-tail-pointer parse-tag-stream) | |
2046 | |
2047 ;; Points to cons cell in parse-tag-stream whose car is the last | |
2048 ;; item that has been sent to display engine. | |
2049 (parse-tag-stream-last-displayed-item parse-tag-stream) | |
2050 | |
2051 ;; The buffer which contains the HTML we are parsing. This | |
2052 ;; variable is used to avoid using the more expensive | |
2053 ;; save-excursion. | |
2054 (parse-buffer (current-buffer)) | |
2055 | |
2056 ;; Points to start of region of text since the previous tag. | |
2057 (between-tags-start (point-min)) | |
2058 | |
2059 ;; Points past end of region of text since the previous tag. Only | |
2060 ;; non-nil when the region has been completely determined and is | |
2061 ;; ready to be processed. | |
2062 between-tags-end | |
2063 | |
2064 ;; See doc string. | |
2065 w3-p-d-tag-name | |
2066 | |
2067 ;; See doc string. | |
2068 w3-p-d-end-tag-p | |
2069 | |
2070 ;; Is the tag we are looking at a null-end-tag-enabling | |
2071 ;; start-tag? | |
2072 net-tag-p | |
2073 | |
2074 ;; Attributes of the tag we are looking at. An alist whose items | |
2075 ;; are pairs of the form (SYMBOL . STRING). | |
2076 tag-attributes | |
2077 | |
2078 ;; Points past end of attribute value we are looking at. Points | |
2079 ;; past the syntactic construct, not the value of the attribute, | |
2080 ;; which may be at (1- attribute-value-end). | |
2081 attribute-value-end | |
2082 | |
2083 ;; Points past end of tag we are looking at. | |
2084 tag-end | |
2085 | |
2086 ;; See doc string. | |
2087 (w3-p-d-current-element (w3-fresh-element-for-tag '*document)) | |
2088 | |
2089 ;; See doc string. | |
2090 (w3-p-d-open-element-stack (list (w3-fresh-element-for-tag '*holder))) | |
2091 | |
2092 ;; ***not implemented yet*** | |
2093 (marked-section-undo-stack nil) | |
2094 | |
2095 ;; See doc string. | |
2096 (w3-p-d-debug-url t) | |
2097 | |
2098 ;; Any of the following variables with the comment ";*NESTED*" | |
2099 ;; are syntactic or semantic features that were introduced by | |
2100 ;; some containing element or marked section which will be undone | |
2101 ;; when we close that element or marked section. | |
2102 | |
2103 ;; See doc string. | |
2104 (w3-p-d-non-markup-chars nil) ;*NESTED* | |
2105 | |
2106 ;; See doc string. | |
2107 (w3-p-d-null-end-tag-enabled nil) ;*NESTED* | |
2108 | |
2109 ;; See doc string. | |
2110 (w3-p-d-in-parsed-marked-section nil) ;*NESTED* | |
2111 | |
2112 ;; See doc string. | |
2113 (w3-p-d-shortrefs nil) ;*NESTED* | |
2114 | |
2115 ;; See doc string. | |
2116 (w3-p-d-shortref-chars nil) ;*NESTED* | |
2117 | |
2118 ;; ******* maybe not needed. | |
2119 ;; | |
2120 ;; ;; Are we recognizing start-tags? | |
2121 ;; (recognizing-start-tags t) ;*NESTED* | |
2122 ;; | |
2123 ;; ;; Are we recognizing end-tags? If this is non-nil and not t, | |
2124 ;; ;; then only the end tag of the current open element is | |
2125 ;; ;; recognized. | |
2126 ;; (recognizing-end-tags t) ;*NESTED* | |
2127 | |
2128 ;; See doc string. | |
2129 (w3-p-d-exceptions nil) ;*NESTED* | |
2130 | |
2131 ;; Scratch variables used in this function | |
2132 ref attr-name attr-value content-model content open-list | |
2133 ) | |
2134 ;; Scratch variables used by macros and defsubsts we call. | |
2135 (w3-p-s-let-bindings | |
2136 | |
2137 (w3-update-non-markup-chars) | |
2138 | |
2139 ;; Main loop. Handle markup as follows: | |
2140 ;; | |
2141 ;; non-empty tag: Handle the region since the previous tag as PCDATA, | |
2142 ;; RCDATA, CDATA, if allowed by syntax. Then handle the tag. | |
2143 ;; | |
2144 ;; general entity (&name;): expand it and parse the result. | |
2145 ;; | |
2146 ;; shortref (_, {, }, and ^ in math stuff): Expand it and parse the | |
2147 ;; result. | |
2148 ;; | |
2149 ;; SGML marked section (<![ keywords [ conditional-text ]]>): Either | |
2150 ;; strip the delimiters and parse the result or delete. | |
2151 ;; | |
2152 ;; comment: Delete. | |
2153 ;; | |
2154 ;; empty tag (<>, </>): Handle as the appropriate tag. | |
2155 ;; | |
2156 ;; markup declaration (e.g. <!DOCTYPE ...>): Delete. | |
2157 ;; | |
2158 ;; SGML processing instruction (<?name>): Delete. | |
2159 ;; | |
2160 (while | |
2161 ;; Continue as long as we processed something last time and we | |
2162 ;; have more to process. | |
2163 (prog1 | |
2164 (not (and (= last-loop-start (point)) | |
2165 (eobp))) | |
2166 (setq last-loop-start (point))) | |
2167 | |
2168 ;; Display progress messages if asked and/or do incremental display | |
2169 ;; of results | |
2170 (cond ((= 0 (% (setq loop-count (1+ loop-count)) 40)) | |
2171 (if w3-do-incremental-display | |
2172 (w3-pause)) | |
2173 (if status-message-format | |
2174 (message status-message-format | |
2175 ;; Percentage of buffer processed. | |
2176 (/ (* (point) one-hundred) (point-max)))))) | |
2177 | |
2178 ;; Go to next interesting thing in the buffer. | |
2179 (skip-chars-forward w3-p-d-non-markup-chars) | |
2180 | |
2181 ;; We are looking at a markup-starting character, and invalid | |
2182 ;; character, or end of buffer. | |
2183 (cond | |
2184 | |
2185 ((= ?< (following-char)) | |
2186 | |
2187 ;; We are looking at a tag, comment, markup declaration, SGML marked | |
2188 ;; section, SGML processing instruction, or non-markup "<". | |
2189 (forward-char) | |
2190 (cond | |
2191 | |
2192 ((looking-at "/?\\([a-z][-a-z0-9.]*\\)") | |
2193 ;; We are looking at a non-empty tag. | |
2194 | |
2195 (setq w3-p-d-tag-name | |
2196 (intern (downcase (buffer-substring (match-beginning 1) | |
2197 (match-end 1))))) | |
2198 (setq w3-p-d-end-tag-p (= ?/ (following-char))) | |
2199 (setq between-tags-end (1- (point))) | |
2200 (goto-char (match-end 0)) | |
2201 | |
2202 ;; Read the attributes from a start-tag. | |
2203 (or | |
2204 w3-p-d-end-tag-p | |
2205 | |
2206 ;; Attribute values can be: | |
2207 ;; "STRING" where STRING does not contain the double quote | |
2208 ;; 'STRING' where STRING does not contain the single quote | |
2209 ;; name-start character, *name character | |
2210 ;; *name character | |
2211 ;; Digit, +name character | |
2212 ;; +Digit | |
2213 ;; or a SPACE-separated list of one of the last four | |
2214 ;; possibilities (there is a comment somewhere that this is a | |
2215 ;; misinterpretation of the grammar, so we ignore this | |
2216 ;; possibility). | |
2217 (while | |
2218 (looking-at | |
2219 (eval-when-compile | |
2220 (concat | |
2221 ;; Leading whitespace. | |
2222 "[ \n\r\t]*" | |
2223 ;; The attribute name, possibly with a bad syntax | |
2224 ;; component. | |
2225 "\\([a-z][-a-z0-9.]*\\(\\([_][-a-z0-9._]*\\)?\\)\\)" | |
2226 ;; Trailing whitespace and perhaps an "=". | |
2227 "[ \n\r\t]*\\(\\(=[ \n\r\t]*\\)?\\)"))) | |
2228 | |
2229 (cond ((/= (match-beginning 2) (match-end 2)) | |
2230 (w3-debug-html | |
2231 :nocontext | |
2232 (format "Bad attribute name syntax: %s" | |
2233 (buffer-substring (match-beginning 1) | |
2234 (match-end 1)))))) | |
2235 | |
2236 (setq attr-name | |
2237 (intern (downcase (buffer-substring (match-beginning 1) | |
2238 (match-end 1))))) | |
2239 (goto-char (match-end 0)) | |
2240 (cond | |
2241 ((< (match-beginning 4) (match-end 4)) | |
2242 ;; A value was specified (e.g. ATTRIBUTE=VALUE). | |
2243 (cond | |
2244 ((looking-at | |
2245 (eval-when-compile | |
2246 (concat | |
2247 ;; Literal with double quotes. | |
2248 "\"\\([^\"]*\\)\"" | |
2249 "\\|" | |
2250 ;; Literal with single quotes. | |
2251 "'\\([^']\\)*'" | |
2252 "\\|" | |
2253 ;; Handle bad HTML conflicting with NET-enabling | |
2254 ;; start-tags. | |
2255 "\\([-a-z0-9.]+/[-a-z0-9._/#]+\\)[ \t\n\r>]" | |
2256 "\\|" | |
2257 ;; SGML NAME-syntax attribute value. | |
2258 "\\([-a-z0-9.]+\\)[ \t\n\r></]" | |
2259 ))) | |
2260 (cond | |
2261 ((or (match-beginning 1) | |
2262 (match-beginning 2)) | |
2263 ;; We have an attribute value literal. | |
2264 (narrow-to-region (1+ (match-beginning 0)) | |
2265 (1- (match-end 0))) | |
2266 | |
2267 ;; In attribute value literals, EE and RS are ignored | |
2268 ;; and RE and SEPCHAR characters sequences are | |
2269 ;; replaced by SPACEs. | |
2270 ;; | |
2271 ;; (There is no way right now to get RS into one of | |
2272 ;; these so that it can be ignored. This is due to | |
2273 ;; our using Unix line-handling conventions.) | |
2274 (skip-chars-forward "^&\t\n\r") | |
2275 (if (eobp) | |
2276 nil | |
2277 ;; We must expand entities and replace RS, RE, | |
2278 ;; and SEPCHAR. | |
2279 (goto-char (point-min)) | |
2280 (while (progn | |
2281 (skip-chars-forward "^&") | |
2282 (not (eobp))) | |
2283 (w3-expand-entity-at-point-maybe)) | |
2284 (subst-char-in-region (point-min) (point-max) ?\t 32) | |
2285 (subst-char-in-region (point-min) (point-max) ?\n 32)) | |
2286 ;; Set this after we have changed the size of the | |
2287 ;; attribute. | |
2288 (setq attribute-value-end (1+ (point-max)))) | |
2289 ((match-beginning 4) | |
2290 (setq attribute-value-end (match-end 4)) | |
2291 (narrow-to-region (point) attribute-value-end)) | |
2292 ((match-beginning 3) | |
2293 (setq attribute-value-end (match-end 3)) | |
2294 (narrow-to-region (point) attribute-value-end) | |
2295 ;; Horribly illegal non-SGML handling of bad | |
2296 ;; HTML on the net. This can break valid HTML. | |
2297 (setq attr-value (buffer-substring (point) | |
2298 (match-end 3))) | |
2299 (w3-debug-html :nocontext | |
2300 (format "Evil attribute value syntax: %s" | |
2301 (buffer-substring (point-min) (point-max))))) | |
2302 (t | |
2303 (error "impossible")))) | |
2304 ((memq (following-char) '(?\" ?')) | |
2305 ;; Missing terminating quote character. | |
2306 (narrow-to-region (point) | |
2307 (progn | |
2308 (forward-char 1) | |
2309 (skip-chars-forward "^ \t\n\r'\"=<>") | |
2310 (setq attribute-value-end (point)))) | |
2311 (w3-debug-html :nocontext | |
2312 (format "Attribute value missing end quote: %s" | |
2313 (buffer-substring (point-min) (point-max)))) | |
2314 (narrow-to-region (1+ (point-min)) (point-max))) | |
2315 (t | |
2316 ;; We have a syntactically invalid attribute value. Let's | |
2317 ;; make a best guess as to what the author intended. | |
2318 (narrow-to-region (point) | |
2319 (progn | |
2320 (skip-chars-forward "^ \t\n\r'\"=<>") | |
2321 (setq attribute-value-end (point)))) | |
2322 (w3-debug-html :nocontext | |
2323 (format "Bad attribute value syntax: %s" | |
2324 (buffer-substring (point-min) (point-max)))))) | |
2325 ;; Now we have isolated the attribute value. We need to | |
2326 ;; munge the value depending on the syntax of the | |
2327 ;; attribute. | |
2328 ;; *** Right now, we only implement the necessary munging | |
2329 ;; for CDATA attributes, which is none. I'm not sure why | |
2330 ;; this happens to work for other attributes right now. | |
2331 ;; For any other kind of attribute, we are supposed to | |
2332 ;; * smash case | |
2333 ;; * remove leading/trailing whitespace | |
2334 ;; * smash multiple space sequences into single spaces | |
2335 ;; * verify the syntax of each token | |
2336 (setq attr-value (buffer-substring (point-min) (point-max))) | |
2337 (widen) | |
2338 (goto-char attribute-value-end)) | |
2339 (t | |
2340 ;; No value was specified, in which case NAME should be | |
2341 ;; taken as ATTRIBUTE=NAME where NAME is one of the | |
2342 ;; enumerated values for ATTRIBUTE. | |
2343 ;; We assume here that ATTRIBUTE is the same as NAME. | |
2344 ;; *** Another piece of code will fix the attribute name if it | |
2345 ;; is wrong. | |
2346 (setq attr-value (symbol-name attr-name)))) | |
2347 | |
2348 ;; Accumulate the attributes. | |
2349 (setq tag-attributes (cons (cons attr-name attr-value) | |
2350 tag-attributes)))) | |
2351 | |
2352 ;; Process the end of the tag. | |
2353 (skip-chars-forward " \t\n\r") | |
2354 (cond ((= ?> (following-char)) | |
2355 ;; Ordinary tag end. | |
2356 (forward-char 1)) | |
2357 ((and (= ?/ (following-char)) | |
2358 (not w3-p-d-end-tag-p)) | |
2359 ;; This is a NET-enabling start-tag. | |
2360 (setq net-tag-p t) | |
2361 (forward-char 1)) | |
2362 ((= ?< (following-char)) | |
2363 ;; *** Strictly speaking, the following text has to | |
2364 ;; lexically be STAGO or ETAGO, which means that it | |
2365 ;; can't match some other lexical unit. | |
2366 ;; Unclosed tag. | |
2367 nil) | |
2368 (t | |
2369 ;; Syntax error. | |
2370 (w3-debug-html | |
2371 (format "Bad unclosed %s%s tag" | |
2372 (if w3-p-d-end-tag-p "/" "") | |
2373 (w3-sgml-name-to-string w3-p-d-tag-name))))) | |
2374 | |
2375 (setq tag-end (point))) | |
2376 | |
2377 ((looking-at "/?>") | |
2378 ;; We are looking at an empty tag (<>, </>). | |
2379 (setq w3-p-d-end-tag-p (= ?/ (following-char))) | |
2380 (setq w3-p-d-tag-name (if w3-p-d-end-tag-p | |
2381 (w3-element-name w3-p-d-current-element) | |
2382 ;; *** Strictly speaking, if OMITTAG NO, then | |
2383 ;; we should use the most recently closed tag. | |
2384 ;; But OMITTAG YES in HTML and I'm lazy. | |
2385 (w3-element-name w3-p-d-current-element))) | |
2386 (setq tag-attributes nil) | |
2387 ;; *** Make sure this is not at top level. | |
2388 (setq between-tags-end (1- (point))) | |
2389 (setq tag-end (match-end 0))) | |
2390 | |
2391 ;; *** In SGML, <(doctype)element> is valid tag syntax. This | |
2392 ;; cannot occur in HTML because the CONCUR option is off in the | |
2393 ;; SGML declaration. | |
2394 | |
2395 ((looking-at "!--") | |
2396 ;; We found a comment, delete to end of comment. | |
2397 (delete-region | |
2398 (1- (point)) | |
2399 (progn | |
2400 (forward-char 1) | |
2401 ;; Skip over pairs of -- ... --. | |
2402 (if (looking-at "\\(--[^-]*\\(-[^-]+\\)*--[ \t\r\n]*\\)+>") | |
2403 (goto-char (match-end 0)) | |
2404 ;; Syntax error! | |
2405 (w3-debug-html | |
2406 "Bad comment (unterminated or unbalanced \"--\" pairs)") | |
2407 (forward-char 2) | |
2408 (or (re-search-forward "--[ \t\r\n]*>" nil t) | |
2409 (search-forward ">" nil t))) | |
2410 (point)))) | |
2411 | |
2412 ((looking-at "!>\\|\\?[^>]*>") | |
2413 ;; We are looking at an empty comment or a processing | |
2414 ;; instruction. Delete it. | |
2415 (replace-match "") | |
2416 (delete-char -1)) | |
2417 | |
2418 ((looking-at "![a-z]") | |
2419 ;; We are looking at a markup declaration. Delete it. | |
2420 ;; *** Technically speaking, to handle valid HTML I think we | |
2421 ;; need to handle "<!USEMAP ... >" declarations. In the future, | |
2422 ;; to handle general SGML, we should parse "<!DOCTYPE ... >" | |
2423 ;; declarations as well (which can contain other declarations). | |
2424 ;; In the very distant future, perhaps we will handle "<!SGML | |
2425 ;; ... >" declarations. | |
2426 ;; *** Should warn if it's not SGML, DOCTYPE, or USEMAP. | |
2427 (backward-char 1) | |
2428 (delete-region | |
2429 (point) | |
2430 (progn | |
2431 (condition-case nil | |
2432 (forward-sexp 1) | |
2433 (error | |
2434 ;; *** This might not actually be bad syntax, but might | |
2435 ;; instead be a -- ... -- comment with unbalanced | |
2436 ;; parentheses somewhere inside the declaration. Handling | |
2437 ;; this properly would require full parsing of markup | |
2438 ;; declarations, a goal for the future. | |
2439 (w3-debug-html "Bad <! syntax.") | |
2440 (skip-chars-forward "^>") | |
2441 (if (= ?> (following-char)) | |
2442 (forward-char)))) | |
2443 (point)))) | |
2444 | |
2445 ((looking-at "!\\\[\\(\\([ \t\n\r]*[a-z]+\\)+[ \t\n\r]*\\)\\\[") | |
2446 ;; We are looking at a marked section. | |
2447 ;; *** Strictly speaking, we should issue a warning if the | |
2448 ;; keywords are invalid or missing or if the "[" does not follow. | |
2449 ;; We must look at the keywords to understand how to parse it. | |
2450 ;; *** Strictly speaking, we should perform parameter entity | |
2451 ;; substitution on the keywords first. | |
2452 (goto-char (match-beginning 1)) | |
2453 (insert ?\)) | |
2454 (goto-char (1- (match-beginning 0))) | |
2455 (delete-char 3) | |
2456 (insert ?\() | |
2457 (backward-char 1) | |
2458 (let* ((keywords (read (current-buffer))) | |
2459 ;; Multiple keywords may appear, but only the most | |
2460 ;; significant takes effect. Rank order is IGNORE, CDATA, | |
2461 ;; RCDATA, INCLUDE, and TEMP. INCLUDE and TEMP have the | |
2462 ;; same effect. | |
2463 (keyword (car-safe (cond ((memq 'IGNORE keywords)) | |
2464 ((memq 'CDATA keywords)) | |
2465 ((memq 'RCDATA keywords)) | |
2466 ((memq 'INCLUDE keywords)) | |
2467 ((memq 'TEMP keywords)))))) | |
2468 (or (= ?\[ (following-char)) | |
2469 ;; I probably shouldn't even check this, since it is so | |
2470 ;; impossible. | |
2471 (error "impossible")) | |
2472 (forward-char 1) | |
2473 (delete-region (1- (match-beginning 0)) (point)) | |
2474 (cond ((eq 'IGNORE keyword) | |
2475 ;; Scan forward skipping over matching <![ ... ]]> | |
2476 ;; until we find an unmatched "]]>". | |
2477 (let ((ignore-nesting 1) | |
2478 (start-pos (point))) | |
2479 (while (> ignore-nesting 0) | |
2480 (if (re-search-forward "<!\\\\\[\\|\]\]>" nil t) | |
2481 (setq ignore-nesting | |
2482 (if (eq ?> (preceding-char)) | |
2483 (1- ignore-nesting) | |
2484 (1+ ignore-nesting))) | |
2485 (w3-debug-html | |
2486 "Unterminated IGNORE marked section.") | |
2487 (setq ignore-nesting 0) | |
2488 (goto-char start-pos))) | |
2489 (delete-region start-pos (point)))) | |
2490 ((eq 'CDATA keyword) | |
2491 (error "***unimplemented***")) | |
2492 ((eq 'RCDATA keyword) | |
2493 (error "***unimplemented***")) | |
2494 ((memq keyword '(INCLUDE TEMP)) | |
2495 (error "***unimplemented***"))))) | |
2496 ((and (looking-at "!") | |
2497 w3-netscape-compatible-comments) | |
2498 ;; Horribly illegal non-SGML handling of bad HTML on the net. | |
2499 ;; This can break valid HTML. | |
2500 ;; This arises because Netscape discards anything looking like | |
2501 ;; "<!...>". So people expect they can use this construct as | |
2502 ;; a comment. | |
2503 (w3-debug-html "Evil <! comment syntax.") | |
2504 (backward-char 1) | |
2505 (delete-region | |
2506 (point) | |
2507 (progn | |
2508 (skip-chars-forward "^>") | |
2509 (if (= ?> (following-char)) | |
2510 (forward-char)) | |
2511 (point)))) | |
2512 (t | |
2513 ;; This < is not a markup character. Pretend we didn't notice | |
2514 ;; it at all. We have skipped over the < already, so just loop | |
2515 ;; again. | |
2516 ))) | |
2517 | |
2518 ((= ?& (following-char)) | |
2519 (w3-expand-entity-at-point-maybe)) | |
2520 | |
2521 ((and (= ?\] (following-char)) | |
2522 w3-p-d-in-parsed-marked-section | |
2523 (looking-at "]]>")) | |
2524 ;; *** handle the end of a parsed marked section. | |
2525 (error "***unimplemented***")) | |
2526 | |
2527 ((and (= ?/ (following-char)) | |
2528 w3-p-d-null-end-tag-enabled) | |
2529 ;; We are looking at a null end tag. | |
2530 (setq w3-p-d-end-tag-p t) | |
2531 (setq between-tags-end (point)) | |
2532 (setq tag-end (1+ (point))) | |
2533 (setq w3-p-d-tag-name (w3-element-name w3-p-d-current-element))) | |
2534 | |
2535 ;; This can be slow, since we'll hardly ever get here. | |
2536 ;; *** Strictly speaking, I think we're supposed to handle | |
2537 ;; shortrefs that begin with the same characters as other markup, | |
2538 ;; preferring the longest match. | |
2539 ;; I will assume that shortrefs never begin with <, &, \], /. | |
2540 ((setq ref (catch 'found-shortref | |
2541 (let ((refs w3-p-d-shortrefs)) | |
2542 (while refs | |
2543 (if (looking-at (car (car refs))) | |
2544 (throw 'found-shortref (cdr (car refs)))) | |
2545 (setq refs (cdr refs)))))) | |
2546 ;; We are looking at a shortref for which there is an | |
2547 ;; expansion defined in the current syntax. Replace with the | |
2548 ;; expansion, leaving point at the beginning so it will be parsed | |
2549 ;; on the next loop. | |
2550 ;; *** eek. This is wrong if the shortref is for an entity with | |
2551 ;; CDATA syntax which should not be reparsed for tags. | |
2552 (replace-match "") | |
2553 (let ((pt (point))) | |
2554 (insert ref) | |
2555 (goto-char pt))) | |
2556 | |
2557 ((looking-at (eval-when-compile | |
2558 (concat "[" (w3-invalid-sgml-chars) "]"))) | |
2559 (w3-debug-html | |
2560 (format "Invalid SGML character: %c" (following-char))) | |
2561 (insert (or (cdr-safe (assq (following-char) | |
2562 ;; These characters are apparently | |
2563 ;; from a Windows character set. | |
2564 '((146 . "'") | |
2565 (153 . "TM")))) | |
2566 "")) | |
2567 (delete-char 1)) | |
2568 | |
2569 ((eobp) | |
2570 ;; We have finished the buffer. Make sure we process the last | |
2571 ;; piece of text, if any. | |
2572 (setq between-tags-end (point)) | |
2573 ;; We have to test what's on the element stack because this | |
2574 ;; piece of code gets executed twice. | |
2575 (cond ((not (eq '*holder (w3-element-name w3-p-d-current-element))) | |
2576 ;; This forces the calculation of implied omitted end tags. | |
2577 (setq w3-p-d-tag-name '*document) | |
2578 (setq w3-p-d-end-tag-p t) | |
2579 (setq tag-end (point))))) | |
2580 | |
2581 (t | |
2582 (error "unreachable code, this can't happen"))) | |
2583 | |
2584 ;; If we have determined the boundaries of a non-empty between-tags | |
2585 ;; region of text, then handle it. | |
2586 (cond | |
2587 (between-tags-end | |
2588 (cond | |
2589 ((< between-tags-start between-tags-end) | |
2590 ;; We have a non-empty between-tags region. | |
2591 | |
2592 ;; We check if it's entirely whitespace, because we record the | |
2593 ;; transitions for whitespace separately from those for | |
2594 ;; data with non-whitespace characters. | |
2595 (goto-char between-tags-start) | |
2596 (skip-chars-forward " \t\n\r" between-tags-end) | |
2597 (cond | |
2598 ((w3-grok-tag-or-data (prog1 | |
2599 (if (= between-tags-end (point)) | |
2600 '*space | |
2601 '*data) | |
2602 (goto-char between-tags-end))) | |
2603 ;; We have to include the text in the current element's | |
2604 ;; contents. If this is the first item in the current | |
2605 ;; element's contents, don't include a leading newline if | |
2606 ;; there is one. Add a trailing newline as a separate text | |
2607 ;; item so that it can be removed later if it turns out to | |
2608 ;; be the last item in the current element's contents when | |
2609 ;; the current element is closed. | |
2610 ;; *** We could perform this test before calling | |
2611 ;; w3-grok-tag-or-data, but it's not clear which will be | |
2612 ;; faster in practice. | |
2613 (or (setq content (w3-element-content w3-p-d-current-element)) | |
2614 ;; *** Strictly speaking, in SGML the record end is | |
2615 ;; carriage return, not line feed. | |
2616 (if (= ?\n (char-after between-tags-start)) | |
2617 (setq between-tags-start (1+ between-tags-start)))) | |
2618 (if (= between-tags-start (point)) | |
2619 ;; Do nothing. | |
2620 nil | |
2621 ;; We are definitely going to add data characters to the | |
2622 ;; content. | |
2623 ;; Protocol is that all but last data character item | |
2624 ;; must have been sent to display engine. | |
2625 (and content | |
2626 (stringp (car content)) | |
2627 ;; Gross, disgusting hack to deal with old interface | |
2628 ;; to display engine. Remove as soon as possible. | |
2629 (not (memq (w3-element-name w3-p-d-current-element) | |
2630 '(plaintext style xmp textarea))) | |
2631 (w3-add-display-item 'text (car content))) | |
2632 (cond | |
2633 ((and (= ?\n (preceding-char)) | |
2634 (/= between-tags-start (1- (point)))) | |
2635 (setq content (cons (buffer-substring between-tags-start | |
2636 (1- (point))) | |
2637 content)) | |
2638 ;; Gross, disgusting hack to deal with old interface | |
2639 ;; to display engine. Remove as soon as possible. | |
2640 (or (memq (w3-element-name w3-p-d-current-element) | |
2641 '(plaintext style xmp textarea)) | |
2642 (w3-add-display-item 'text (car content))) | |
2643 (setq content (cons "\n" content))) | |
2644 (t | |
2645 (setq content (cons (buffer-substring between-tags-start | |
2646 (point)) | |
2647 content)))) | |
2648 (w3-set-element-content w3-p-d-current-element content)))))) | |
2649 | |
2650 (setq between-tags-end nil))) | |
2651 | |
2652 ;; If the previous expression modified (point), then it went to | |
2653 ;; the value of between-tags-end. | |
2654 | |
2655 ;; If we found a start or end-tag, we need to handle it. | |
2656 (cond | |
2657 (w3-p-d-tag-name | |
2658 | |
2659 ;; Move past the tag and prepare for next between-tags region. | |
2660 (goto-char tag-end) | |
2661 (setq between-tags-start (point)) | |
2662 | |
2663 (cond | |
2664 (w3-p-d-end-tag-p | |
2665 ;; Handle an end-tag. | |
2666 (if (eq w3-p-d-tag-name (w3-element-name w3-p-d-current-element)) | |
2667 (w3-close-element) | |
2668 ;; Handle the complex version. We have to search up (down?) | |
2669 ;; the open element stack to find the element that matches (if | |
2670 ;; any). Then we close all of the elements. On a conforming | |
2671 ;; SGML document this can do no wrong and it's not | |
2672 ;; unreasonable on a non-conforming document. | |
2673 | |
2674 ;; Can't safely modify stack until we know the element we want | |
2675 ;; to find is in there, so work with a copy. | |
2676 (setq open-list w3-p-d-open-element-stack) | |
2677 (while (and open-list | |
2678 (not (eq w3-p-d-tag-name | |
2679 (w3-element-name (car open-list))))) | |
2680 (setq open-list (cdr open-list))) | |
2681 (cond (open-list | |
2682 ;; We found a match. Pop elements. | |
2683 ;; We will use the following value as a sentinel. | |
2684 (setq open-list (cdr open-list)) | |
2685 (while (not (eq open-list w3-p-d-open-element-stack)) | |
2686 (w3-close-element t)) | |
2687 (w3-close-element)) | |
2688 (t | |
2689 ;; Bogus end tag. | |
2690 (w3-debug-html | |
2691 (format "Unmatched end-tag </%s>" | |
2692 (w3-sgml-name-to-string w3-p-d-tag-name))))))) | |
2693 (t | |
2694 ;; Handle a start-tag. | |
2695 (cond | |
2696 ;; Check if the new element is allowed in the current element's | |
2697 ;; content model. | |
2698 ((w3-grok-tag-or-data w3-p-d-tag-name) | |
2699 (w3-open-element w3-p-d-tag-name tag-attributes) | |
2700 | |
2701 ;; Handle NET-enabling start tags. | |
2702 (cond ((and net-tag-p | |
2703 (not w3-p-d-null-end-tag-enabled)) | |
2704 ;; Save old values. | |
2705 (w3-set-element-undo-list | |
2706 w3-p-d-current-element | |
2707 (cons (cons 'w3-p-d-non-markup-chars | |
2708 w3-p-d-non-markup-chars) | |
2709 (cons '(w3-p-d-null-end-tag-enabled . nil) | |
2710 (w3-element-undo-list w3-p-d-current-element)))) | |
2711 ;; Alter syntax. | |
2712 (setq w3-p-d-null-end-tag-enabled t) | |
2713 (w3-update-non-markup-chars))) | |
2714 | |
2715 (setq content-model | |
2716 (w3-element-content-model w3-p-d-current-element)) | |
2717 | |
2718 ;; If the element does not have parsed contents, then we | |
2719 ;; can find its contents immediately. | |
2720 (cond | |
2721 ((memq content-model '(EMPTY CDATA XCDATA XXCDATA RCDATA)) | |
2722 (cond | |
2723 ((eq 'EMPTY content-model) | |
2724 (w3-close-element)) | |
2725 ((eq 'CDATA content-model) | |
2726 ;; CDATA: all data characters until an end-tag. We'll | |
2727 ;; process the end-tag on the next loop. | |
2728 (if (re-search-forward (if w3-p-d-null-end-tag-enabled | |
2729 "</[a-z>]\\|/" | |
2730 "</[a-z>]") | |
2731 nil 'move) | |
2732 (goto-char (match-beginning 0)))) | |
2733 ((eq 'XCDATA content-model) | |
2734 ;; XCDATA: special non-SGML-standard mode which includes | |
2735 ;; all data characters until "</foo" is seen where "foo" | |
2736 ;; is the name of this element (for XMP and LISTING). | |
2737 (if (search-forward | |
2738 (concat "</" (symbol-name | |
2739 (w3-element-name w3-p-d-current-element))) | |
2740 nil 'move) | |
2741 (goto-char (match-beginning 0)))) | |
2742 ((eq 'XXCDATA content-model) | |
2743 ;; XXCDATA: special non-SGML-standard mode which includes | |
2744 ;; all data until end-of-entity (end-of-buffer for us) | |
2745 ;; (for PLAINTEXT). | |
2746 (goto-char (point-max))) | |
2747 ((eq 'RCDATA content-model) | |
2748 ;; RCDATA: all data characters until end-tag is seen, | |
2749 ;; except that entities are expanded first, although the | |
2750 ;; expansions are _not_ scanned for end-tags, although the | |
2751 ;; expansions _are_ scanned for further entity | |
2752 ;; references. | |
2753 (while (progn | |
2754 (if (re-search-forward (if w3-p-d-null-end-tag-enabled | |
2755 "</[a-z>]\\|[/&]" | |
2756 "</[a-z>]\\|&") | |
2757 nil 'move) | |
2758 (goto-char (match-beginning 0))) | |
2759 (= ?& (following-char))) | |
2760 (w3-expand-entity-at-point-maybe))))))) | |
2761 (t | |
2762 ;; The element is illegal here. We'll just discard the start | |
2763 ;; tag as though we never saw it. | |
2764 )))) | |
2765 | |
2766 (setq w3-p-d-tag-name nil) | |
2767 (setq w3-p-d-end-tag-p nil) | |
2768 (setq net-tag-p nil) | |
2769 (setq tag-attributes nil) | |
2770 (setq tag-end nil))) | |
2771 | |
2772 ;; Hand items to the display engine. | |
2773 (cond ((not nodraw) | |
2774 (set-buffer w3-draw-buffer) | |
2775 (while (not (eq parse-tag-stream-last-displayed-item | |
2776 w3-p-d-parse-tag-stream-tail-pointer)) | |
2777 (setq parse-tag-stream-last-displayed-item | |
2778 (cdr parse-tag-stream-last-displayed-item)) | |
2779 ;; We call w3-handle-single-tag from only one spot so that it | |
2780 ;; is reasonable to inline it, since it is a big function. | |
2781 (w3-handle-single-tag | |
2782 (car (car parse-tag-stream-last-displayed-item)) | |
2783 (cdr (car parse-tag-stream-last-displayed-item)))) | |
2784 (set-buffer parse-buffer))) | |
2785 | |
2786 ;; End of main while loop. | |
2787 ) | |
2788 | |
2789 ;; We have finished parsing the buffer! | |
2790 (if status-message-format | |
2791 (message "%sdone" (format status-message-format 100))) | |
2792 ;; Do this now so the user can see the full results before Emacs | |
2793 ;; goes off and garbage-collects for an hour. :-( | |
2794 (if w3-do-incremental-display | |
2795 (w3-pause)) | |
2796 | |
2797 ;; *** For debugging, save the true parse tree. | |
2798 ;; *** Make this look inside *DOCUMENT. | |
2799 (setq w3-last-parse-tree | |
2800 (w3-element-content w3-p-d-current-element)) | |
2801 | |
2802 ;; Return the parse in the format expected, a stream of tags | |
2803 ;; possibly with a buffer at the front. | |
2804 (if nodraw | |
2805 ;; Discard the *dummy item at start of list. | |
2806 (cdr parse-tag-stream) | |
2807 (cons w3-draw-buffer (cdr parse-tag-stream))) | |
2808 | |
2809 ))) | |
2810 | |
2811 | |
2812 ;;; | |
2813 ;;; Initialization of display engine to accept parser output. | |
2814 ;;; | |
2815 | |
2816 (defun w3-prepare-draw-buffer-for-parse-buffer () | |
2817 (setq list-buffers-directory nil) | |
2818 (let ((buf (get-buffer-create (url-generate-new-buffer-name | |
2819 "Untitled"))) | |
2820 (info (mapcar (function (lambda (x) (cons x (symbol-value x)))) | |
2821 w3-persistent-variables))) | |
2822 (setq w3-draw-buffer buf) | |
2823 (save-excursion | |
2824 (set-window-buffer (selected-window) buf) | |
2825 (set-buffer buf) | |
2826 (setq w3-draw-buffer (current-buffer)) | |
2827 (erase-buffer) | |
2828 (buffer-disable-undo (current-buffer)) | |
2829 (mapcar (function (lambda (x) (set (car x) (cdr x)))) info) | |
2830 (setq w3-last-fill-pos (point)) | |
2831 (setq fill-column (min (- (or w3-strict-width (window-width)) | |
2832 w3-right-border) | |
2833 (or w3-maximum-line-length (window-width)))) | |
2834 (setq fill-prefix "") | |
2835 (w3-init-state)))) | |
2836 | |
2837 | |
2838 | |
2839 (provide 'w3-parse) | |
2840 | |
2841 ;; Local variables: | |
2842 ;; indent-tabs-mode: nil | |
2843 ;; end: |