comparison lisp/w3/w3-parse.el @ 0:376386a54a3c r19-14

Import from CVS: tag r19-14
author cvs
date Mon, 13 Aug 2007 08:45:50 +0200
parents
children ac2d302a0011
comparison
equal deleted inserted replaced
-1:000000000000 0:376386a54a3c
1 ;; Created by: Joe Wells, jbw@csb.bu.edu
2 ;; Created on: Sat Sep 30 17:25:40 1995
3 ;; Filename: w3-parse.el
4 ;; Purpose: Parse HTML and/or SGML for Emacs W3 browser.
5
6 ;; Copyright © 1995 Joseph Brian Wells
7 ;; Copyright © 1993, 1994, 1995 by William M. Perry (wmperry@spry.com)
8 ;;
9 ;; This program is free software; you can redistribute it and/or modify
10 ;; it under the terms of the GNU General Public License as published by
11 ;; the Free Software Foundation; either version 2 of the License, or
12 ;; (at your option) any later version.
13 ;;
14 ;; This program is distributed in the hope that it will be useful,
15 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
16 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 ;; GNU General Public License for more details.
18 ;;
19 ;; You should have received a copy of the GNU General Public License
20 ;; along with this program; if not, write to the Free Software
21 ;; Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
22 ;;
23 ;; On November 13, 1995, the license was available at
24 ;; <URL:ftp://prep.ai.mit.edu/pub/gnu/COPYING-2.0>. It may still be
25 ;; obtainable via that URL.
26
27
28 ;;;
29 ;;; Trying to make the best of an evil speed hack.
30 ;;;
31
32 ;; Explanation:
33
34 ;; Basically, this file provides one big function (w3-parse-buffer) and
35 ;; some data structures. However, to avoid code redundancy, I have broken
36 ;; out some common subexpressions of w3-parse-buffer into separate
37 ;; functions. I have declared these separate functions with "defsubst" so
38 ;; they will be inlined into w3-parse-buffer. Also, I have defined them
39 ;; within eval-when-compile forms, so no definitions will be emitted into
40 ;; the .elc file for these separate functions. (They will work normally
41 ;; when the uncompiled file is loaded.)
42
43 ;; Each of these subfunctions use some scratch variables in a purely local
44 ;; fashion. In good software design, I would declare these variables as
45 ;; close to their use as possible with "let". However, "let"-binding
46 ;; variables is *SLOW* in Emacs Lisp, even when compiled. Since each of
47 ;; these functions is executed one or more time during each iteration of
48 ;; the main loop, I deemed this too expensive. So the main function does
49 ;; the "let"-binding of these variables. However, I still want to declare
50 ;; them close to their use, partially to keep the compiler from crying
51 ;; "Wolf!" when there is no danger (well, maybe a little danger :-), so I
52 ;; define some macros for this purpose.
53
54 ;; Also, there are some variables which are updated throughout the file
55 ;; (remember this is really all one function). Some of the code which
56 ;; updates them is located inside the subfunctions. So that the compiler
57 ;; will not complain, these variables are defined with defvar.
58
59 (require 'w3-vars)
60
61 (eval-when-compile
62 (defconst w3-p-s-var-list nil
63 "A list of the scratch variables used by functions called by
64 w3-parse-buffer which it is w3-parse-buffer's responsibility to
65 \"let\"-bind.")
66
67 ;; *** This is unused and does not belong right here anyway.
68 (defmacro w3-resolve-numeric-entity (ent)
69 "Return a string representing the numeric entity ENT (&#ENT;)"
70 (` (if (< (, ent) 256)
71 (char-to-string (, ent))
72 (format "[Too large character: %s]" (, ent)))))
73
74 (defmacro w3-p-s-var-def (var)
75 "Declare VAR as a scratch variable which w3-parse-buffer must
76 \"let\"-bind."
77 (` (eval-when-compile
78 (defvar (, var))
79 (or (memq '(, var) w3-p-s-var-list)
80 (setq w3-p-s-var-list (cons '(, var) w3-p-s-var-list))))))
81
82 (defmacro w3-p-s-let-bindings (&rest body)
83 "\"let\"-bind all of the variables in w3-p-s-var-list in BODY."
84 (` (let (, w3-p-s-var-list)
85 (,@ body))))
86 (put 'w3-p-s-let-bindings 'lisp-indent-function 0)
87 (put 'w3-p-s-let-bindings 'edebug-form-spec t)
88
89 (defvar w3-p-d-current-element)
90 (put 'w3-p-d-current-element 'variable-documentation
91 "Information structure for the current open element.")
92
93 (defvar w3-p-d-exceptions)
94 (put 'w3-p-d-exceptions 'variable-documentation
95 "Alist specifying elements (dis)allowed because of an (ex|in)clusion
96 exception of some containing element (not necessarily the immediately
97 containing element). Each item specifies a transition for an element
98 which overrides that specified by the current element's content model.
99 Each item is of the form (TAG ACTION *same ERRORP).")
100
101 (defvar w3-p-d-in-parsed-marked-section)
102 (put 'w3-p-d-in-parsed-marked-section 'variable-documentation
103 "Are we in a parsed marked section so that we have to scan for \"]]>\"?")
104
105 (defvar w3-p-d-non-markup-chars)
106 (put 'w3-p-d-non-markup-chars 'variable-documentation
107 "The characters that do not indicate the start of markup, in the format
108 for an argument to skip-chars-forward.")
109
110 (defvar w3-p-d-null-end-tag-enabled)
111 (put 'w3-p-d-null-end-tag-enabled 'variable-documentation
112 "Is the null end tag (\"/\") enabled?")
113
114 (defvar w3-p-d-open-element-stack)
115 (put 'w3-p-d-open-element-stack 'variable-documentation
116 "A stack of the currently open elements, with the innermost enclosing
117 element on top and the outermost on bottom.")
118
119 (defvar w3-p-d-parse-tag-stream-tail-pointer)
120 (put 'w3-p-d-parse-tag-stream-tail-pointer 'variable-documentation
121 "Points to last cons cell in parse-tag stream. We add items to tail of
122 parse-tag-stream instead of head.")
123
124 (defvar w3-p-d-shortrefs)
125 (put 'w3-p-d-shortrefs 'variable-documentation
126 "An alist of the magic entity reference strings in the current
127 between-tags region and their replacements. Each item is of the format
128 \(REGEXP . REPLACEMENT-STRING\). Although in SGML shortrefs normally name
129 entities whose value should be used as the replacement, we have
130 preexpanded the entities for speed. We have also regexp-quoted the
131 strings to be replaced, so they can be used with looking-at. This should
132 never be in an element's overrides field unless
133 w3-p-d-shortref-chars is also in the field.")
134
135 (defvar w3-p-d-shortref-chars)
136 (put 'w3-p-d-shortref-chars 'variable-documentation
137 "A string of the characters which can start shortrefs in the current
138 between-tags region. This must be in a form which can be passed to
139 skip-chars-forward and must contain exactly the characters which start the
140 entries in w3-p-d-shortrefs. If this variable is mentioned in the
141 overrides field of an element, its handling is magical in that the
142 variable w3-p-d-non-markup-chars is saved to the element's undo-list and
143 updated at the same time. This should never be in an element's overrides
144 field unless w3-p-d-shortrefs is also in the field.")
145
146 (defvar w3-p-d-tag-name)
147 (put 'w3-p-d-tag-name 'variable-documentation
148 "Name of tag we are looking at, as an Emacs Lisp symbol.
149 Only non-nil when we are looking at a tag.")
150
151 (defvar w3-p-d-end-tag-p)
152 (put 'w3-p-d-end-tag-p 'variable-documentation
153 "Is the tag we are looking at an end tag?
154 Only non-nil when we are looking at a tag.")
155
156 )
157
158
159 ;;;
160 ;;; HTML syntax error messages.
161 ;;;
162
163 (eval-when-compile
164
165 (defvar w3-p-d-debug-url)
166 (put 'w3-p-d-debug-url 'variable-documentation
167 "Whether to print the URL being parsed before an error messages.
168 Only true for the first error message.")
169
170 ;; The level parameter indicates whether the error is (1) very
171 ;; serious, must be displayed to all users, (2) invalid HTML, but the
172 ;; user should only be told if the user has indicated interest, or (3)
173 ;; valid HTML which is bad because it appears to rely on the way certain
174 ;; browsers will display it, which should only be displayed to the user
175 ;; if they have really asked for it.
176
177 (defmacro w3-debug-html (&rest body)
178 "Emit a warning message.
179 These keywords may be used at the beginning of the arguments:
180 :mandatory-if sexp -- force printing if sexp evaluates non-nil.
181 :bad-style -- do not print unless w3-debug-html is 'style.
182 :outer -- do not include the current element in the element
183 context we report.
184 :nocontext -- do not include context where error detected.
185 The remaining parameters are treated as the body of a progn, the value of
186 which must be a string to use as the error message."
187 (let (mandatory-if bad-style outer nocontext condition)
188 (while (memq (car body) '(:mandatory-if :bad-style :outer :nocontext))
189 (cond ((eq ':mandatory-if (car body))
190 (setq mandatory-if (car (cdr body)))
191 (setq body (cdr (cdr body))))
192 ((eq ':bad-style (car body))
193 (setq bad-style t)
194 (setq body (cdr body)))
195 ((eq ':nocontext (car body))
196 (setq nocontext t)
197 (setq body (cdr body)))
198 ((eq ':outer (car body))
199 (setq outer t)
200 (setq body (cdr body)))))
201 (setq condition (if bad-style
202 '(eq 'style w3-debug-html)
203 'w3-debug-html))
204 (if mandatory-if
205 (setq condition
206 (` (or (, mandatory-if)
207 (, condition)))))
208 (` (if (, condition)
209 (let ((message (progn (,@ body))))
210 (if message
211 (w3-debug-html-aux message
212 (,@ (if nocontext
213 (list outer nocontext)
214 (if outer '(t)))))))))))
215
216 ;; This is unsatisfactory.
217 (put 'w3-debug-html 'lisp-indent-function 0)
218
219 (put 'w3-debug-html 'edebug-form-spec
220 '([&rest &or ":nocontext" ":outer" [":mandatory-if" form] ":bad-style"]
221 &rest form))
222 )
223
224 (defun w3-debug-html-aux (message &optional outer nocontext)
225 (let (
226 ;; We have already determined whether the user should see the
227 ;; message, so don't let w3-warn suppress it.
228 (w3-debug-html t))
229 ;; Print the URL before the first error message for a document.
230 (cond (w3-p-d-debug-url
231 (let ((url (url-view-url t)))
232 (w3-warn 'html
233 (if (or (null url)
234 (string-equal "" url))
235 (format "HTML errors for buffer %s"
236 (current-buffer))
237 (format "HTML errors for <URL:%s>" url))))
238 (setq w3-p-d-debug-url nil)))
239 (w3-warn 'html
240 (if nocontext
241 message
242 (concat message
243 ;; Display context information for each error
244 ;; message.
245 "\n Containing elements: "
246 (w3-open-elements-string (if outer 1))
247 (concat
248 "\n Text around error: "
249 (save-restriction
250 (widen)
251 (progn
252 (insert "*ERROR*")
253 (prog1
254 (w3-quote-for-string
255 (buffer-substring
256 (max (- (point) 27) (point-min))
257 (min (+ (point) 20) (point-max))))
258 (delete-char -7))))))))))
259
260 (defun w3-quote-for-string (string)
261 (save-excursion
262 (set-buffer (get-buffer-create " w3-quote-whitespace"))
263 (erase-buffer)
264 (insert string)
265 (goto-char (point-min))
266 (insert "\"")
267 (while (progn
268 (skip-chars-forward "^\"\\\t\n\r")
269 (not (eobp)))
270 (insert "\\" (cdr (assq (following-char) '((?\" . "\"")
271 (?\\ . "\\")
272 (?\t . "t")
273 (?\n . "n")
274 (?\r . "r")))))
275 (delete-char 1))
276 (insert "\"")
277 (buffer-string)))
278
279
280 ;;;
281 ;;; General entity references and numeric character references.
282 ;;;
283
284 ;; *** MULE conversion?
285 ;; *** I18N HTML support?
286
287 (let ((html-entities w3-html-entities))
288 (while html-entities
289 (put (car (car html-entities)) 'html-entity-expansion
290 (cons 'CDATA (if (integerp (cdr (car html-entities)))
291 (char-to-string (cdr (car html-entities)))
292 (cdr (car html-entities)))))
293 (setq html-entities (cdr html-entities))))
294
295 ;; These are handled differently than the normal HTML entities because
296 ;; we need to define the entities with 'nil instead of 'CDATA so
297 ;; that they are correctly scanned for new markup.
298 ;;
299 ;; from jbw@cs.bu.edu
300 ;;
301 ;;> Of course, this differs from the specification a bit. The W3C tech
302 ;;> report defines all of these as SYSTEM entities. This potentially means
303 ;;> that they can be used in more contexts. The method I outlined above
304 ;;> means "&smiley;" can only be used in contexts where IMG is a valid
305 ;;> element. I am not sure exactly where it is okay to use a SYSTEM entity.
306 ;;> I think anywhere that data characters are accepted.
307 ;;
308 ;; I find this acceptable, as just what the hell are you supposed to do with
309 ;; &computer; as part of a value of a form input when you display it and/or
310 ;; submit it?!
311
312 (let ((html-entities w3-graphic-entities)
313 (cur nil))
314 (while html-entities
315 (setq cur (car html-entities)
316 html-entities (cdr html-entities))
317 (put (nth 0 cur) 'html-entity-expansion
318 (cons 'nil (format "img src=\"%s/%s%s\" alt=\"%s\""
319 w3-icon-directory
320 (nth 1 cur)
321 (if w3-icon-format
322 (concat "." (symbol-name w3-icon-format))
323 "")
324 (or (nth 3 cur) (nth 2 cur)))))))
325
326 ;; These are the general entities in HTML 3.0 in terms of which the math
327 ;; shortrefs are defined:
328 ;;
329 ;; <!ENTITY REF1 STARTTAG "SUP">
330 ;; <!ENTITY REF2 ENDTAG "SUP">
331 ;; <!ENTITY REF3 STARTTAG "SUB">
332 ;; <!ENTITY REF4 ENDTAG "SUB">
333 ;; <!ENTITY REF5 STARTTAG "BOX">
334 ;; <!ENTITY REF6 ENDTAG "BOX">
335 ;;
336 ;; We're ignoring them because these names should really be local to the
337 ;; DTD and not visible in the document. They might change at any time in
338 ;; future HTML standards.
339
340 ;; <!--Entities for language-dependent presentation (BIDI and contextual analysis) -->
341 ;; <!ENTITY zwnj CDATA "&#8204;"-- zero width non-joiner-->
342 ;; <!ENTITY zwj CDATA "&#8205;"-- zero width joiner-->
343 ;; <!ENTITY lrm CDATA "&#8206;"-- left-to-right mark-->
344 ;; <!ENTITY rlm CDATA "&#8207;"-- right-to-left mark-->
345
346 ;; Entity names are case sensitive!
347
348 ;; & should only be recognized when followed by letter or # and
349 ;; digit or # and letter.
350
351 (eval-when-compile
352
353 (w3-p-s-var-def w3-p-s-entity)
354 (w3-p-s-var-def w3-p-s-pos)
355 (w3-p-s-var-def w3-p-s-num)
356 ;; Destroys free variables:
357 ;; w3-p-s-entity, w3-p-s-pos, w3-p-s-num
358 ;; Depends on case-fold-search being t.
359 (defsubst w3-expand-entity-at-point-maybe ()
360 ;; We are looking at a &.
361 ;; Only &A or &#1 or &#A syntax is special.
362 (cond
363 ((and (looking-at "&\\([a-z][-a-z0-9.]*\\)[\ ;\n]?") ; \n should be \r
364 ;; We are looking at a general entity reference, maybe undefined.
365 (setq w3-p-s-entity
366 (get
367 (intern (buffer-substring (match-beginning 1) (match-end 1)))
368 'html-entity-expansion)))
369
370 ;; If the reference was undefined, then for SGML, we should really
371 ;; issue a warning and delete the reference. However, the HTML
372 ;; standard (contradicting the SGML standard) says to leave the
373 ;; undefined reference in the text.
374
375 ;; We are looking at a defined general entity reference.
376 (replace-match "")
377 (cond ((eq 'CDATA (car w3-p-s-entity))
378 ;; Leave point after expansion so we don't rescan it.
379 (insert (cdr w3-p-s-entity)))
380 ((memq (car w3-p-s-entity) '(nil STARTTAG ENDTAG MS MD))
381 ;; nil is how I mark ordinary entities.
382 ;; The replacement text gets rescanned for all of these.
383 (setq w3-p-s-pos (point))
384 (insert (cdr (assq (car w3-p-s-entity)
385 '((nil . "")
386 (STARTTAG . "<")
387 (ENDTAG . "</")
388 (MS . "<![")
389 (MD . "<!"))))
390 (cdr w3-p-s-entity)
391 (cdr (assq (car w3-p-s-entity)
392 '((nil . "")
393 (STARTTAG . ">")
394 (ENDTAG . ">")
395 (MS . "]]>")
396 (MD . ">")))))
397 (goto-char w3-p-s-pos)
398 ;; *** Strictly speaking, if we parse anything from the
399 ;; replacement text, it must end before the end of the
400 ;; replacement text.
401 )
402 ((eq 'SDATA (car w3-p-s-entity))
403 (insert "[Unimplemented SDATA \"%s\"]" (cdr w3-p-s-entity)))
404 ((eq 'PI (car w3-p-s-entity))
405 ;; We are currently ignoring processing instructions.
406 ;; *** Strictly speaking, we should issue a warning if this
407 ;; occurs in a attribute value.
408 )
409 (t
410 ;; *** We don't handle external entities yet.
411 (error "[Unimplemented entity: \"%s\"]" w3-p-s-entity))))
412
413 ((looking-at "&#[0-9][0-9]*\\([\ ;\n]?\\)") ; \n should be \r
414 ;; We are looking at a numeric character reference.
415 ;; Ensure the number is already terminated by a semicolon or carriage
416 ;; return so we can use "read" to get it as a number quickly.
417 (cond ((= (match-beginning 1) (match-end 1))
418 ;; This is very uncommon, so we don't have to be quick here but
419 ;; rather correct.
420 (save-excursion
421 (goto-char (match-end 0)) ; same as match-end 1
422 (insert ?\;))
423 ;; Set up the match data properly
424 (looking-at "&#[0-9][0-9]*;")))
425 (forward-char 2)
426 (setq w3-p-s-num (read (current-buffer)))
427 ;; Always leave point after the expansion of a numeric
428 ;; character reference, like it were a CDATA entity.
429 (replace-match "")
430 ;; char-to-string will hopefully do something useful with characters
431 ;; larger than 255. I think in MULE it does. Is this true?
432 ;; Bill wants to call w3-resolve-numeric-entity here, but I think
433 ;; that functionality belongs in char-to-string.
434 ;; The largest valid character in the I18N version of HTML is 65533.
435 ;; <URL:ftp://ds.internic.net/internet-drafts/draft-ietf-html-i18n-01.txt>
436 (insert (char-to-string w3-p-s-num)))
437
438 ((looking-at "&#\\(re\\|rs\\|space\\|tab\\)[\ ;\n]?") ; \n should be \r
439 (replace-match (assq (upcase (char-after (+ 3 (point))))
440 '(;; *** Strictly speaking, record end should be
441 ;; carriage return.
442 (?E . "\n") ; RE
443 ;; *** And record start should be line feed.
444 (?S . "") ; RS
445 (?P . " ") ; SPACE
446 (?A . "\t")))) ; TAB
447 ;; Leave point after the expansion of a character reference, so it
448 ;; doesn't get rescanned.
449 ;; *** Strictly speaking, we should issue a warning for &#foo; if foo
450 ;; is not a function character in the SGML declaration.
451 )
452
453 ((eq ?& (following-char))
454 ;; We are either looking at an undefined reference or a & that does
455 ;; not start a reference (in which case we should not have been called).
456 ;; Skip over the &.
457 (forward-char 1))
458
459 (t
460 ;; What is the code doing calling us if we're not looking at a "&"?
461 (error "this should never happen"))))
462
463 )
464
465
466 ;;;
467 ;;; Syntax table used in markup declarations.
468 ;;;
469
470 (defvar w3-sgml-md-syntax-table
471 (let ((table (make-syntax-table))
472 (items '(
473 (0 "." 255) ; clear everything
474 (?\r " ")
475 (?\t " ")
476 (?\n " ")
477 (32 " ") ; space
478 (?< "\(>")
479 (?> "\)<")
480 (?\( "\(\)")
481 (?\) "\)\(")
482 (?\[ "\(\]")
483 (?\] "\)\[")
484 (?\" "\"")
485 (?\' "\"")
486 (?a "w" ?z)
487 (?A "w" ?Z)
488 (?0 "w" ?9)
489 (?. "w")
490 ;; "-" can be a character in a NAME, but it is also used in
491 ;; "--" as both a comment start and end within SGML
492 ;; declarations ("<!" ... ">"). In HTML, it is only used
493 ;; as a NAME character in the parameter entities
494 ;; Content-Type, HTTP-Method, and style-notations and in
495 ;; the attribute name http-equiv and in the notation names
496 ;; dsssl-lite and w3c-style. We would like to be able to
497 ;; train Emacs to skip over these kinds of comments with
498 ;; forward-sexp and backward-sexp. Is there any way to
499 ;; teach Emacs how to do this? It doesn't seem to be the
500 ;; case.
501 (?- "w")
502 )))
503 (while items
504 (let* ((item (car items))
505 (char (car item))
506 (syntax (car (cdr item)))
507 (bound (or (car-safe (cdr-safe (cdr item)))
508 char)))
509 (while (<= char bound)
510 (modify-syntax-entry char syntax table)
511 (setq char (1+ char))))
512 (setq items (cdr items)))
513 table)
514 "A syntax table for parsing SGML markup declarations.")
515
516
517 ;;;
518 ;;; Element information data type.
519 ;;;
520
521 ;; The element information data type is used in two ways:
522 ;;
523 ;; * To store the DTD, there is one element record for each element in
524 ;; the DTD.
525 ;;
526 ;; * To store information for open elements in the current parse tree.
527 ;; Each such element is initialized by copying the element record
528 ;; from the DTD. This means that values in the fields can not be
529 ;; destructively altered, although of course the fields can be
530 ;; changed.
531
532 ;; The cells in this vector are:
533 ;;
534 ;; name: the element's name (a generic identifier).
535 ;;
536 ;; end-tag-name: a symbol whose name should be the result of prefixing
537 ;; the generic-identifier with a slash. This is a convenience value for
538 ;; interfacing with the display engine which expects a stream of start
539 ;; and end tags in this format rather than a tree.
540 ;;
541 ;; content-model: a data structure describing what elements or character
542 ;; data we expect to find within this element. This is either a symbol
543 ;; listed here:
544 ;;
545 ;; EMPTY: no content, no end-tag allowed.
546 ;; CDATA: all data characters until "</[a-z]" is seen.
547 ;; XCDATA: special non-SGML-standard mode which includes all data
548 ;; characters until "</foo" is seen where "foo" is the name of this
549 ;; element. (for XMP and LISTING)
550 ;; XXCDATA: special non-SGML-standard mode which includes all data
551 ;; until end-of-entity (end-of-buffer for us). (for PLAINTEXT)
552 ;; RCDATA: all data characters until "</[a-z]" is seen, except that
553 ;; entities are expanded first, although the expansions are not
554 ;; scanned for end-tags.
555 ;; XINHERIT: special non-SGML-standard mode which means to use the
556 ;; content model of the containing element instead.
557 ;;
558 ;; or a vector of this structure:
559 ;;
560 ;; [(INCLUDES INCSPACEP (((TAG ...) . TRANSITION) ...) DEFAULT) ...]
561 ;;
562 ;; where INCLUDES is of the format:
563 ;;
564 ;; (TAG ...)
565 ;;
566 ;; where each TRANSITION is one of these:
567 ;;
568 ;; (ACTION NEW-STATE ERRORP)
569 ;; (ACTION NEW-STATE)
570 ;; (ACTION)
571 ;;
572 ;; where DEFAULT is one of these:
573 ;;
574 ;; nil or TRANSITION
575 ;;
576 ;; where the meaning of the components is:
577 ;;
578 ;; INCLUDES is a list of tags for which the transition (*include *same
579 ;; nil) applies.
580 ;;
581 ;; DEFAULT if non-nil is a transition that should be taken when
582 ;; matching any possibility not explicitly listed in another
583 ;; TRANSITION, except for data characters containing only whitespace.
584 ;;
585 ;; INCSPACEP specifies how to handle data characters which include
586 ;; only whitespace characters. The value is non-nil to indicate
587 ;; (*include *same nil) or nil to indicate (*discard *same nil).
588 ;;
589 ;; TAG is a symbol corresponding to the start-tag we are looking at,
590 ;; or *data when seeing character data that includes at least one
591 ;; non-space character.
592 ;;
593 ;; ACTION is one of:
594 ;; *close: Close this element and try again using content model of
595 ;; enclosing element. (Note that this does not apply to the
596 ;; case of an element being closed by its own end-tag.)
597 ;; *include: Process new element as subelement of this one or
598 ;; include data characters directly.
599 ;; *discard: Discard a start-tag or data characters.
600 ;; *retry: Try again after processing NEW-STATE and ERRORP.
601 ;; ELEMENT: Open ELEMENT (with default attributes), then try again
602 ;; using its content model.
603 ;;
604 ;; NEW-STATE (optional, default *same) is the index of the state to
605 ;; move to after processing the element or one of these:
606 ;; *same: no state change occurs.
607 ;; *next: change the the current state + 1.
608 ;; The initial state is 0. NEW-STATE does not matter if ACTION is
609 ;; *close.
610 ;;
611 ;; ERRORP (optional, default nil) if non-nil indicates this transition
612 ;; represents an error. The error message includes this value if it
613 ;; is a string.
614 ;;
615 ;; If no matching transition is found, the default transition is
616 ;; (*discard *same "not allowed here").
617 ;;
618 ;; overrides: An alist of pairs of the form (VAR REPLACEP . VALUE).
619 ;; When this element is opened, the old value of VAR is saved in the
620 ;; undo-list. If REPLACEP is non-nil, then VAR gets value VALUE,
621 ;; otherwise VAR gets value (append VALUE (symbol-value VAR)). Useful
622 ;; values for VAR are:
623 ;;
624 ;; w3-p-d-exceptions: See doc string.
625 ;;
626 ;; w3-p-d-shortrefs: See doc string.
627 ;;
628 ;; w3-p-d-shortref-chars: See doc string.
629 ;;
630 ;; end-tag-omissible: Whether it is legal to omit the end-tag of this
631 ;; element. If an end-tag is inferred for an element whose end tag is
632 ;; not omissible, an error message is given.
633 ;;
634 ;; state: The current state in the content model. Preset to the initial
635 ;; state of 0.
636 ;;
637 ;; undo-list: an alist of of former values of local variables
638 ;; of w3-parse-buffer to restore upon closing this element. Each
639 ;; item on the list is of the format (VAR . VALUE-TO-RESTORE).
640 ;;
641 ;; attributes: an alist of attributes and values. Each item on
642 ;; this list is of the format (ATTRIBUTE-NAME . VALUE). Each
643 ;; ATTRIBUTE-NAME is a symbol and each attribute value is a
644 ;; string.
645 ;;
646 ;; content: a list of the accumulated content of the element. While the
647 ;; element is open, the list is in order from latest to earliest,
648 ;; otherwise it is in order from earliest to latest. Each member is
649 ;; either a string of data characters or a list of the form (NAME
650 ;; ATTRIBUTES CONTENT), where NAME is the subelement's name, ATTRIBUTES
651 ;; is an alist of the subelement's attribute names (lowercase symbols)
652 ;; and their values (strings), and CONTENT is the subelement's content.
653
654 (eval-when-compile
655
656 (defconst w3-element-fields
657 '(name end-tag-name content-model state overrides undo-list
658 content attributes end-tag-omissible deprecated))
659
660 (let* ((fields w3-element-fields)
661 (index (1- (length fields))))
662 (while fields
663 (let* ((field (symbol-name (car fields)))
664 (get-sym (intern (concat "w3-element-" field)))
665 (set-sym (intern (concat "w3-set-element-" field))))
666 (eval (` (progn
667 (defmacro (, get-sym) (element)
668 (list 'aref element (, index)))
669 (defmacro (, set-sym) (element value)
670 (list 'aset element (, index) value))))))
671 (setq fields (cdr fields))
672 (setq index (1- index))))
673
674 (defmacro w3-make-element ()
675 (list 'make-vector (length w3-element-fields) nil))
676
677 ;; *** move this to be with DTD declaration.
678 (defmacro w3-fresh-element-for-tag (tag)
679 (` (copy-sequence
680 (or (get (, tag) 'html-element-info)
681 (error "unimplemented element %s"
682 (w3-sgml-name-to-string (, tag)))))))
683
684 ;; *** move this to be with DTD declaration.
685 (defmacro w3-known-element-p (tag)
686 (` (get (, tag) 'html-element-info)))
687
688 (defsubst w3-sgml-name-to-string (sym)
689 (upcase (symbol-name sym)))
690
691 )
692
693
694 ;;;
695 ;;; Parse tree manipulation.
696 ;;;
697
698 ;; ;; Find the name of the previous element or a substring of the
699 ;; ;; preceding data characters.
700 ;; (let ((content (w3-element-content (car stack))))
701 ;; (while content
702 ;; (cond
703 ;; ((and (stringp (car content))
704 ;; (not (string-match "\\`[ \t\n\r]*\\'" (car content))))
705 ;; (setq prior-item (car content))
706 ;; ;; Trim trailing whitespace
707 ;; (if (string-match "\\(.*[^ \t\n\r]\\)[ \t\n\r]*\\'" prior-item)
708 ;; (setq prior-item (substring prior-item 0 (match-end 1))))
709 ;; (if (> (length prior-item) 8)
710 ;; (setq prior-item (concat "..." (substring prior-item -8))))
711 ;; (setq prior-item (w3-quote-for-string prior-item))
712 ;; (setq prior-item (concat "\(after " prior-item "\)"))
713 ;; (setq content nil))
714 ;; ((and (consp (car content))
715 ;; (symbolp (car (car content))))
716 ;; (setq prior-item
717 ;; (concat "\(after "
718 ;; (w3-sgml-name-to-string (car (car content)))
719 ;; "\)"))
720 ;; (setq content nil))
721 ;; (t
722 ;; (setq content (cdr content))))))
723
724 ;; Only used for HTML debugging.
725 (defun w3-open-elements-string (&optional skip-count)
726 (let* ((stack (nthcdr (or skip-count 0)
727 (cons w3-p-d-current-element
728 w3-p-d-open-element-stack)))
729 ;;(prior-item "(at start)")
730 result)
731 ;; Accumulate the names of the enclosing elements.
732 (while stack
733 (let ((element (w3-element-name (car stack))))
734 (if (eq '*holder element)
735 nil
736 ;; Only include *DOCUMENT if there are no other elements.
737 (if (or (not (eq '*document element))
738 (null result))
739 (setq result (cons (w3-sgml-name-to-string element)
740 result)))))
741 (setq stack (cdr stack)))
742 (setq result (mapconcat 'identity result ":"))
743 (if result
744 ;;(concat
745 result
746 ;; prior-item)
747 "[nowhere!]")))
748
749 ;; *** This doesn't really belong here, but where?
750 (eval-when-compile
751 (defmacro w3-invalid-sgml-chars ()
752 "Characters not allowed in an SGML document using the reference
753 concrete syntax (i.e. HTML). Returns a string in the format expected by
754 skip-chars-forward."
755 "\000-\010\013\014\016-\037\177-\237"))
756
757 (eval-when-compile
758 ;; Uses:
759 ;; w3-p-d-null-end-tag-enabled, w3-p-d-in-parsed-marked-section,
760 ;; w3-p-d-shortref-chars
761 ;; Modifies free variable:
762 ;; w3-p-d-non-markup-chars
763 (defsubst w3-update-non-markup-chars ()
764 (setq w3-p-d-non-markup-chars
765 (concat "^&<"
766 (w3-invalid-sgml-chars)
767 (if w3-p-d-null-end-tag-enabled "/" "")
768 (if w3-p-d-in-parsed-marked-section "]" "")
769 (or w3-p-d-shortref-chars ""))))
770
771 ;; Modifies free variable:
772 ;; w3-p-d-parse-tag-stream-tail-pointer
773 (defsubst w3-add-display-item (tag value)
774 (setcdr w3-p-d-parse-tag-stream-tail-pointer
775 (list (cons tag value)))
776 (setq w3-p-d-parse-tag-stream-tail-pointer
777 (cdr w3-p-d-parse-tag-stream-tail-pointer)))
778
779 )
780
781 (eval-when-compile
782 (w3-p-s-var-def w3-p-s-overrides)
783 (w3-p-s-var-def w3-p-s-undo-list)
784 (w3-p-s-var-def w3-p-s-var)
785 ;; Uses free variables:
786 ;; w3-p-d-non-markup-chars
787 ;; Modifies free variables:
788 ;; w3-p-d-current-element, w3-p-d-open-element-stack
789 ;; Destroys free variables:
790 ;; w3-p-s-overrides, w3-p-s-undo-list, w3-p-s-var
791 (defsubst w3-open-element (tag attributes)
792
793 ;; Send trailing data character item in the old current element to
794 ;; display engine.
795 (if (stringp (car-safe (w3-element-content w3-p-d-current-element)))
796 (w3-add-display-item
797 'text
798 (car-safe (w3-element-content w3-p-d-current-element))))
799
800 ;; Push new element on stack.
801 (setq w3-p-d-open-element-stack (cons w3-p-d-current-element
802 w3-p-d-open-element-stack))
803 (setq w3-p-d-current-element (w3-fresh-element-for-tag tag))
804
805 ;; Warn if deprecated or obsolete.
806 (if (w3-element-deprecated w3-p-d-current-element)
807 (w3-debug-html :outer
808 (format "%s element %s."
809 (if (eq 'obsolete
810 (w3-element-deprecated w3-p-d-current-element))
811 "Obsolete"
812 "Deprecated")
813 (w3-sgml-name-to-string
814 (w3-element-name w3-p-d-current-element)))))
815
816 ;; Store attributes.
817 ;; *** we are not handling #CURRENT attributes (HTML has none).
818 (w3-set-element-attributes w3-p-d-current-element attributes)
819 ;; *** Handle default attribute values.
820 ;; *** Fix the attribute name for unnamed values. Right now they will
821 ;; be in the attribute list as items of the format (VALUE . VALUE) where
822 ;; both occurrences of VALUE are the same. The first one needs to be
823 ;; changed to the proper attribute name by consulting the DTD.
824 ;; ********************
825
826 ;; Handle syntax/semantics overrides of new current element.
827 (cond ((w3-element-overrides w3-p-d-current-element)
828 (setq w3-p-s-overrides
829 (w3-element-overrides w3-p-d-current-element))
830 (setq w3-p-s-undo-list nil)
831 (while w3-p-s-overrides
832 (setq w3-p-s-var (car (car w3-p-s-overrides)))
833 (setq w3-p-s-undo-list
834 (cons (cons w3-p-s-var
835 (symbol-value w3-p-s-var))
836 w3-p-s-undo-list))
837 (set w3-p-s-var (if (car (cdr (car w3-p-s-overrides)))
838 (cdr (cdr (car w3-p-s-overrides)))
839 (append (cdr (cdr (car w3-p-s-overrides)))
840 (symbol-value w3-p-s-var))))
841 ;; *** HACK HACK.
842 ;; Magic handling of w3-p-d-shortref-chars.
843 (cond ((eq 'w3-p-d-shortref-chars w3-p-s-var)
844 (setq w3-p-s-undo-list
845 (cons (cons 'w3-p-d-non-markup-chars
846 w3-p-d-non-markup-chars)
847 w3-p-s-undo-list))
848 (w3-update-non-markup-chars)))
849 (setq w3-p-s-overrides (cdr w3-p-s-overrides)))
850 (w3-set-element-undo-list w3-p-d-current-element
851 w3-p-s-undo-list)))
852
853 ;; Handle content-model inheritance. (Very non-SGML!)
854 (if (eq 'XINHERIT (w3-element-content-model w3-p-d-current-element))
855 (w3-set-element-content-model
856 w3-p-d-current-element
857 (w3-element-content-model (car w3-p-d-open-element-stack))))
858
859 ;; Send the start-tag and attributes to the display engine.
860 (if (memq tag '(plaintext style xmp textarea))
861 ;; Garbage special-casing for old display engine.
862 ;; Nothing is sent until end-tag is found.
863 ;; The DTD will ensure no subelements of these elements.
864 nil
865 ;; Normal procedure.
866 (w3-add-display-item tag attributes)))
867 )
868
869 ;; The protocol for handing items to the display engine is as follows.
870 ;;
871 ;; For an element, send (START-TAG . ATTS), each member of the content,
872 ;; and (END-TAG . nil) if the element is allowed to have an end tag.
873 ;;
874 ;; For data characters, send (text . DATA-CHARACTERS).
875 ;;
876 ;; Exceptions:
877 ;;
878 ;; For PLAINTEXT, STYLE, XMP, TEXTAREA send:
879 ;; (START-TAG . ((data . DATA-CHARACTERS) . ATTS)).
880 ;;
881 ;; *** This requires somehow eliminating any subelements of the TEXTAREA
882 ;; element. TEXTAREA can contain subelements in HTML 3.0.
883 ;;
884 ;; For LISTING, send (text . DATA-CHARACTERS). (Is this really correct or
885 ;; is this perhaps a bug in the old parser?) I'm ignoring this for now.
886
887 (eval-when-compile
888 (w3-p-s-var-def w3-p-s-undo-list)
889 (w3-p-s-var-def w3-p-s-content)
890 (w3-p-s-var-def w3-p-s-end-tag)
891 ;; Modifies free variables:
892 ;; w3-p-d-current-element, w3-p-d-open-element-stack
893 ;; Accesses free variables:
894 ;; w3-p-d-tag-name, w3-p-d-end-tag-p
895 ;; Destroys free variables:
896 ;; w3-p-s-undo-list, w3-p-s-content, w3-p-s-end-tag
897 (defsubst w3-close-element (&optional inferred)
898 ;; inferred: non-nil if the end-tag of the current element is being
899 ;; inferred due to the presence of content not allowed in the current
900 ;; element. If t, then the tag causing this is in w3-p-d-tag-name and
901 ;; w3-p-d-end-tag-p.
902 ;; (OLD: ... otherwise it is a symbol indicating the start-tag
903 ;; of an element or *data or *space indicating data characters.)
904
905 (cond ((and inferred
906 (not (w3-element-end-tag-omissible w3-p-d-current-element)))
907 (w3-debug-html
908 (format "</%s> end-tag not omissible (required due to %s)"
909 (w3-sgml-name-to-string
910 (w3-element-name w3-p-d-current-element))
911 (cond ((eq t inferred)
912 (format (if w3-p-d-end-tag-p
913 "</%s> end-tag"
914 "start-tag for %s")
915 (w3-sgml-name-to-string
916 w3-p-d-tag-name)))
917 ;; *** Delete this functionality?
918 ((memq inferred '(*space *data))
919 "data characters")
920 ((symbolp inferred)
921 (format "start-tag for %s"
922 (w3-sgml-name-to-string inferred)))
923 )))))
924
925 ;; Undo any variable bindings of this element.
926 (cond ((w3-element-undo-list w3-p-d-current-element)
927 (setq w3-p-s-undo-list
928 (w3-element-undo-list w3-p-d-current-element))
929 (while w3-p-s-undo-list
930 (set (car (car w3-p-s-undo-list))
931 (cdr (car w3-p-s-undo-list)))
932 (setq w3-p-s-undo-list (cdr w3-p-s-undo-list)))))
933
934 (setq w3-p-s-end-tag
935 (w3-element-end-tag-name w3-p-d-current-element))
936
937 ;; Fix up the content of the current element in preparation for putting
938 ;; it in the parent.
939 ;; Remove trailing newline from content, if there is one, otherwise send
940 ;; any trailing data character item to display engine.
941 (setq w3-p-s-content (w3-element-content w3-p-d-current-element))
942 (cond ((null w3-p-s-content))
943 ((equal "\n" (car w3-p-s-content))
944 (setq w3-p-s-content (cdr w3-p-s-content)))
945 ((and (stringp (car w3-p-s-content))
946 ;; Garbage special-casing for old display engine.
947 (not (memq w3-p-s-end-tag
948 '(/plaintext /style /xmp /textarea))))
949 (w3-add-display-item 'text (car w3-p-s-content))))
950
951 ;; Send the end-tag to the display engine, but only if the element is
952 ;; allowed to have an end tag.
953 (cond ((memq w3-p-s-end-tag '(/plaintext /style /xmp /textarea))
954 ;; Garbage special-casing for old display engine.
955 ;; Format old display engine expects for these elements:
956 ;; (START-TAG . ((data . DATA-CHARACTERS) . ATTRIBUTES))
957 (w3-add-display-item
958 ;; Use the *start*-tag, not the end-tag.
959 (w3-element-name w3-p-d-current-element)
960 (cons (cons 'data
961 (condition-case nil
962 (mapconcat 'identity w3-p-s-content "")
963 (error "eeek! subelement content!")))
964 (w3-element-attributes w3-p-d-current-element))))
965 ;; *** Handle LISTING the way the old parser did.
966 ((eq 'EMPTY (w3-element-content-model w3-p-d-current-element))
967 ;; Do nothing, can't have an end tag.
968 )
969 (t
970 ;; Normal case.
971 (w3-add-display-item w3-p-s-end-tag nil)
972 (if (null w3-p-s-content)
973 (w3-debug-html
974 :bad-style :outer
975 ;; Don't warn for empty TD elements or empty A elements
976 ;; with no HREF attribute.
977 ;; *** Crude hack that should really be encoded in the
978 ;; element database somehow.
979 (if (or (not (memq (w3-element-name w3-p-d-current-element)
980 '(a td)))
981 (assq 'href
982 (w3-element-attributes w3-p-d-current-element)))
983 (format "Empty %s element."
984 (w3-sgml-name-to-string
985 (w3-element-name w3-p-d-current-element))))))))
986
987 ;; Put the current element in the proper place in its parent.
988 ;; This will cause an error if we overpop the stack.
989 (w3-set-element-content
990 (car w3-p-d-open-element-stack)
991 (cons (list (w3-element-name w3-p-d-current-element)
992 (w3-element-attributes w3-p-d-current-element)
993 (nreverse w3-p-s-content))
994 (w3-element-content (car w3-p-d-open-element-stack))))
995
996 ;; Pop the stack.
997 (setq w3-p-d-current-element (car w3-p-d-open-element-stack))
998 (setq w3-p-d-open-element-stack (cdr w3-p-d-open-element-stack)))
999
1000 )
1001
1002
1003 ;;;
1004 ;;; A pseudo-DTD for HTML.
1005 ;;;
1006
1007 (eval-when-compile
1008 ;; This works around the following bogus compiler complaint:
1009 ;; While compiling the end of the data in file w3-parse.el:
1010 ;; ** the function w3-expand-parameters is not known to be defined.
1011 ;; This is a bogus error. Anything of this form will trigger this message:
1012 ;; (eval-when-compile (defun xyzzy () (xyzzy)))
1013 (defun w3-expand-parameters (pars data) nil))
1014
1015 (eval-when-compile
1016 (defun w3-expand-parameters (pars data)
1017 (cond ((null data)
1018 nil)
1019 ((consp data)
1020 ;; This has to be written carefully to avoid exceeding the
1021 ;; maximum lisp function call nesting depth.
1022 (let (result)
1023 (while (consp data)
1024 (let ((car-exp (w3-expand-parameters pars (car data))))
1025 (setq result
1026 (if (and (symbolp (car data))
1027 (not (eq car-exp (car data)))
1028 ;; An expansion occurred.
1029 (listp car-exp))
1030 ;; The expansion was a list, which we splice in.
1031 (condition-case err
1032 (append (reverse car-exp) result)
1033 (wrong-type-argument
1034 (if (eq 'listp (nth 1 err))
1035 ;; Wasn't really a "list" since the last
1036 ;; cdr wasn't nil, so don't try to splice
1037 ;; it in.
1038 (cons car-exp result)
1039 (signal (car err) (cdr err)))))
1040 (cons car-exp result))))
1041 (setq data (cdr data)))
1042 (append (nreverse result)
1043 (w3-expand-parameters pars data))))
1044 ((symbolp data)
1045 (let ((sym-exp (cdr-safe (assq data pars))))
1046 (if sym-exp
1047 (w3-expand-parameters pars sym-exp)
1048 data)))
1049 ((vectorp data)
1050 (let ((i 0)
1051 (result (copy-sequence data)))
1052 (while (< i (length data))
1053 (aset result i
1054 (w3-expand-parameters pars (aref data i)))
1055 (setq i (1+ i)))
1056 result))
1057 (t
1058 data))))
1059
1060 (eval-when-compile
1061 (defun w3-unfold-dtd (items)
1062 (let (result)
1063 (while items
1064 (let* ((item (car items))
1065 (names (car item))
1066 (content-model
1067 (or (cdr-safe (assq 'content-model item))
1068 (error "impossible")))
1069 (overrides (cdr-safe (assq 'overrides item)))
1070 (end-tag-omissible
1071 (or (cdr-safe (assq 'end-tag-omissible item))
1072 ;; *** Is this SGML standard?
1073 (eq 'EMPTY content-model)))
1074 (deprecated (cdr-safe (assq 'deprecated item)))
1075 element
1076 name)
1077 (while names
1078 (setq name (car names))
1079 (setq names (cdr names))
1080
1081 ;; Create and initialize the element information data
1082 ;; structure.
1083 (setq element (w3-make-element))
1084 (w3-set-element-name element name)
1085 (w3-set-element-end-tag-name
1086 element
1087 (intern (concat "/" (symbol-name name))))
1088 (w3-set-element-state element 0)
1089 (w3-set-element-content-model element content-model)
1090 (w3-set-element-end-tag-omissible element end-tag-omissible)
1091
1092 (or (memq deprecated '(nil t obsolete))
1093 (error "impossible"))
1094 (w3-set-element-deprecated element deprecated)
1095
1096 ;; Inclusions and exclusions are specified differently in the
1097 ;; human-coded DTD than in the format the implementation uses.
1098 ;; The human-coded version is designed to be easy to edit and to
1099 ;; work with w3-expand-parameters while the internal version is
1100 ;; designed to be fast. We have to translate here. This work
1101 ;; is repeated for every element listed in `names' so that the
1102 ;; exclusion exception error messages can be accurate.
1103 (let ((inclusions (cdr-safe (assq 'inclusions item)))
1104 (exclusions (cdr-safe (assq 'exclusions item)))
1105 (exclusion-mode '*close)
1106 (exclusion-message
1107 (format "%s exclusion" (w3-sgml-name-to-string name)))
1108 exceptions)
1109 (while inclusions
1110 (setq exceptions (cons (cons (car inclusions)
1111 '(*include *same nil))
1112 exceptions))
1113 (setq inclusions (cdr inclusions)))
1114 (while exclusions
1115 (cond ((memq (car exclusions) '(*discard *include *close))
1116 (setq exclusion-mode (car exclusions)))
1117 ((stringp (car exclusions))
1118 (setq exclusion-message (car exclusions)))
1119 (t
1120 (setq exceptions (cons (list (car exclusions)
1121 exclusion-mode
1122 '*same
1123 exclusion-message)
1124 exceptions))))
1125 (setq exclusions (cdr exclusions)))
1126 (let ((overrides (if exceptions
1127 (cons (cons 'w3-p-d-exceptions
1128 (cons nil exceptions))
1129 overrides)
1130 overrides)))
1131 (w3-set-element-overrides element overrides)))
1132
1133 (setq result (cons (cons name element) result))))
1134 (setq items (cdr items)))
1135 result)))
1136
1137 ;; Load the HTML DTD.
1138 ;; <URL:ftp://ds.internic.net/rfc/rfc1866.txt>
1139 ;; *** Be sure to incorporate rfc1867 when attribute-checking is added.
1140 ;; *** Write function to check sanity of the content-model forms.
1141 ;; *** I18N: Add Q, BDO, SPAN
1142 (mapcar
1143 (function
1144 (lambda (pair)
1145 (put (car pair) 'html-element-info (cdr pair))))
1146 ;; The purpose of this complexity is to speed up loading by
1147 ;; pre-evaluating as much as possible at compile time.
1148 (eval-when-compile
1149 (w3-unfold-dtd
1150 (w3-expand-parameters
1151 '(
1152 (%headempty . (link base meta range))
1153 (%head-deprecated . (nextid))
1154
1155 ;; client-side imagemaps
1156 (%imagemaps . (area map))
1157 ;; special action is taken for %text inside %body.content in the
1158 ;; content model of each element.
1159 (%body.content . (%heading %block hr div address %imagemaps))
1160
1161 (%heading . (h1 h2 h3 h4 h5 h6))
1162
1163 ;; Netscape's CENTER, FONT, and BASEFONT are handled
1164 ;; non-standardly. In actual psuedo-HTML on the net, these are
1165 ;; used as both text-level constructs and block-level constructs.
1166 ;; They are the only items in both %block and %text in this
1167 ;; definition here.
1168 ;; *** Perhaps add BLINK here too? But no one uses that as a block
1169 ;; construct. What about NOBR?
1170 (%netscape-crud . (center font basefont))
1171
1172 ;; Emacs-w3 extensions
1173 (%emacsw3-crud-nonempty . (roach secret wired))
1174 (%emacsw3-crud . (pinhead flame cookie yogsothoth hype peek))
1175
1176 (%block . (p %list dl form %preformatted %netscape-crud font
1177 %blockquote isindex fn table fig note
1178 %block-deprecated %block-obsoleted))
1179 (%list . (ul ol))
1180 (%preformatted . (pre))
1181 (%blockquote . (bq))
1182 (%block-deprecated . (dir menu blockquote))
1183 (%block-obsoleted . (xmp listing))
1184
1185 ;; Why is IMG in this list?
1186 (%pre.exclusion . (*include img *discard tab math big small sub sup))
1187
1188 (%text . (*data b %notmath %netscape-crud sub sup
1189 %emacsw3-crud %emacsw3-crud-nonempty))
1190 (%notmath . (%special %font %phrase %misc))
1191 (%font . (i u s strike tt big small)) ; B left out for MATH handling
1192 (%phrase . (em strong code samp kbd var cite blink))
1193 ;; Don't know if this is right place for EMBED.
1194 (%special . (a img br wbr nobr tab math embed))
1195 (%misc . (q lang au dfn person acronym abbrev ins del))
1196
1197 (%formula . (*data %math))
1198 (%math . (box above below %mathvec root sqrt array sub sup
1199 %mathface))
1200 (%mathvec . (vec bar dot ddot hat tilde))
1201 (%mathface . (b t bt))
1202
1203 (%mathdelims . (over atop choose left right of))
1204
1205 ;; What the hell? This takes BODYTEXT????? No way!
1206 (%bq-content-model . [(nil
1207 nil
1208 (((bodytext) *include *next))
1209 (bodytext *next))
1210 (nil
1211 nil
1212 (((credit) *include *next))
1213 nil)
1214 (nil nil nil nil)
1215 ])
1216
1217 ;; non-default bad HTML handling.
1218 (%in-text-ignore . ((p %heading) *discard *same error))
1219 )
1220 '(
1221 ;; A dummy element that will contain *document.
1222 ((*holder)
1223 (content-model . [(nil nil nil nil)]))
1224 ;; The root of the parse tree. We start with a pseudo-element
1225 ;; named *document for convenience.
1226 ((*document)
1227 (content-model . [(nil nil (((html) *include *next)) (html *next))
1228 (nil
1229 nil
1230 nil
1231 (*include *same "after document end"))])
1232 (end-tag-omissible . t))
1233 ;; HTML O O (HEAD, BODY)
1234 ((html)
1235 (content-model . [(nil
1236 nil
1237 (((head) *include *next))
1238 (head *next))
1239 (nil
1240 nil
1241 (((body) *include *next)
1242 ;; Netscape stuff
1243 ((frameset) *include 4)
1244 )
1245 (body *next))
1246 (nil
1247 nil
1248 (((plaintext) *include *next))
1249 (*retry *next))
1250 (nil
1251 nil
1252 nil
1253 (*include *same "after BODY"))
1254 (nil
1255 nil
1256 nil
1257 (*include *same "after FRAMESET"))
1258 ])
1259 (end-tag-omissible . t))
1260 ((head)
1261 (content-model . [((title isindex %headempty style %head-deprecated)
1262 nil
1263 nil
1264 ;; *** Should only close if tag can
1265 ;; legitimately follow head. So many can that
1266 ;; I haven't bothered to enumerate them.
1267 (*close))])
1268 (end-tag-omissible . t))
1269 ;; TITLE - - (#PCDATA)
1270 ((title)
1271 (content-model . RCDATA ; not official
1272 ;; [((*data) include-space nil nil)]
1273 ))
1274 ;; STYLE - O (#PCDATA)
1275 ;; STYLE needs to be #PCDATA to allow omitted end tag. Bleagh.
1276 ((style)
1277 (content-model . [((*data)
1278 include-space
1279 nil
1280 ;; *** Should only close if tag can
1281 ;; legitimately follow style. So many can that
1282 ;; I haven't bothered to enumerate them.
1283 (*close))])
1284 (end-tag-omissible . t))
1285 ((body)
1286 (content-model . [((banner) nil nil (*retry *next))
1287 ((bodytext) nil nil (bodytext *next))
1288 (nil nil (((plaintext) *close)) nil)])
1289 (inclusions . (spot))
1290 (end-tag-omissible . t))
1291 ;; Do I really want to include BODYTEXT? It has something to do
1292 ;; with mixed content screwing things up, and I don't understand
1293 ;; it. Wait! It's used by BQ!
1294 ((bodytext)
1295 (content-model . [((%body.content)
1296 nil
1297 ;; Push <P> before data characters. Non-SGML.
1298 (((%text) p)
1299 ;; Closing when seeing CREDIT is a stupidity
1300 ;; caused by BQ's sharing of BODYTEXT. BQ
1301 ;; should have its own BQTEXT.
1302 ((credit plaintext) *close))
1303 nil)])
1304 (end-tag-omissible . t))
1305 ((div banner)
1306 (content-model . [((%body.content)
1307 nil
1308 ;; Push <P> before data characters. Non-SGML.
1309 (((%text) p))
1310 nil)]))
1311 ((address)
1312 (content-model . [((p)
1313 nil
1314 ;; Push <P> before data characters. Non-SGML.
1315 (((%text) p))
1316 nil)]))
1317 ((%heading)
1318 (content-model . [((%text)
1319 include-space
1320 ((%in-text-ignore))
1321 nil)]))
1322 ((p)
1323 (content-model . [((%text)
1324 include-space
1325 nil
1326 ;; *** Should only close if tag can
1327 ;; legitimately follow P. So many can that I
1328 ;; don't bother to enumerate here.
1329 (*close))])
1330 (end-tag-omissible . t))
1331 ((ul ol)
1332 (content-model . [((lh)
1333 nil
1334 (((li) *include *next))
1335 (*retry *next))
1336 ((li)
1337 nil
1338 ;; Push <LI> before data characters or block
1339 ;; elements.
1340 ;; Non-SGML.
1341 (((%text %block) li *same error))
1342 nil)]))
1343 ((lh)
1344 (content-model . [((%text)
1345 include-space
1346 (((dd dt li) *close)
1347 (%in-text-ignore))
1348 nil)])
1349 (end-tag-omissible . t))
1350 ((dir menu)
1351 (content-model . [((li)
1352 nil
1353 (((%text) li *same error))
1354 nil)])
1355 (exclusions . (%block))
1356 (deprecated . t))
1357 ((li)
1358 (content-model . [((%block)
1359 nil
1360 (((li) *close)
1361 ;; Push <P> before data characters. Non-SGML.
1362 ((%text) p))
1363 nil)])
1364 (end-tag-omissible . t)
1365 ;; Better bad HTML handling.
1366 ;; Technically, there are a few valid documents that this will
1367 ;; hose, because you can have H1 inside FORM inside LI. However,
1368 ;; I don't think that should be allowed anyway.
1369 (exclusions . (*discard "not allowed here" %heading)))
1370 ((dl)
1371 (content-model . [((lh)
1372 nil
1373 (((dt dd) *include *next))
1374 (*retry *next))
1375 ((dt dd)
1376 nil
1377 ;; Push <DD> before data characters or block
1378 ;; items.
1379 ;; Non-SGML.
1380 (((%text %block) dd *same error))
1381 nil)]))
1382 ((dt)
1383 (content-model . [((%text)
1384 include-space
1385 (((dd dt) *close)
1386 (%in-text-ignore))
1387 nil)])
1388 (end-tag-omissible . t))
1389 ;; DD is just like LI, but we treat it separately because it can be
1390 ;; followed by a different set of elements.
1391 ((dd)
1392 (content-model . [((%block)
1393 nil
1394 (((dt dd) *close)
1395 ;; Push <P> before data characters. Non-SGML.
1396 ((%text) p))
1397 nil)])
1398 (end-tag-omissible . t)
1399 ;; See comment with LI.
1400 (exclusions . (*discard "not allowed here" %heading)))
1401 ((pre)
1402 (content-model . [((%text hr)
1403 include-space
1404 ((%in-text-ignore))
1405 nil)])
1406 (exclusions . (%pre.exclusion)))
1407 ;; BLOCKQUOTE deprecated, BQ okay
1408 ((bq)
1409 (content-model . %bq-content-model))
1410 ((blockquote)
1411 (content-model . %bq-content-model)
1412 ;; BLOCKQUOTE is deprecated in favor of BQ in the HTML 3.0 DTD.
1413 ;; However, BQ is not even mentioned in the HTML 2.0 DTD. So I
1414 ;; don't think we can enable this yet:
1415 ;;(deprecated . t)
1416 )
1417 ((fn note)
1418 (content-model . [((%body.content)
1419 nil
1420 ;; Push <P> before data characters. Non-SGML.
1421 (((%text) p))
1422 nil)]))
1423 ((fig)
1424 (content-model . [((overlay) nil nil (*retry *next))
1425 (nil
1426 nil
1427 (((caption) *include *next))
1428 (*retry *next))
1429 (nil
1430 nil
1431 (((figtext) *include *next)
1432 ((credit) *retry *next))
1433 ;; *** Should only do this for elements that
1434 ;; can be in FIGTEXT.
1435 (figtext *next))
1436 (nil nil (((credit) *include *next)) nil)
1437 (nil nil nil nil)]))
1438 ((caption credit)
1439 (content-model . [((%text)
1440 nil
1441 ((%in-text-ignore))
1442 nil)]))
1443 ((figtext)
1444 (content-model . [((%body.content)
1445 nil
1446 ;; Push <P> before data characters. Very non-SGML.
1447 (((%text) p)
1448 ((credit) *close))
1449 nil)])
1450 (end-tag-omissible . t))
1451 ((%emacsw3-crud)
1452 (content-model . EMPTY))
1453 ((%netscape-crud)
1454 ;; Special non-SGML treatment of Netscape's shit.
1455 (content-model . XINHERIT))
1456 ;; FORM - - %body.content -(FORM) +(INPUT|KEYGEN|SELECT|TEXTAREA)
1457 ((form)
1458 ;; Same as BODY. Ugh!
1459 (content-model . [((%body.content)
1460 nil
1461 ;; Push <P> before data characters. Non-SGML.
1462 (((%text) p))
1463 nil)])
1464 (exclusions . (form))
1465 (inclusions . (input select textarea keygen label)))
1466 ;; *** Where is the URL describing this?
1467 ((label)
1468 (content-model . [((%text)
1469 include-space
1470 nil
1471 nil)])
1472 ;; *** These are already included, no need to repeat.
1473 ;;(inclusions . (input select textarea))
1474 ;; *** Is a LABEL allowed inside a LABEL? I assume no.
1475 (exclusions . (label))
1476 ;; The next line just does the default so is unneeded:
1477 ;;(end-tag-omissible . nil)
1478 )
1479 ;; SELECT - - (OPTION+) -(INPUT|KEYGEN|TEXTAREA|SELECT)>
1480 ;; *** This should be -(everything).
1481 ((select)
1482 (content-model . [((option) nil nil nil)])
1483 (exclusions . (input label select keygen textarea)))
1484 ;; option - O (#PCDATA)
1485 ;; needs to be #PCDATA to allow omitted end tag.
1486 ((option)
1487 ;; I'd like to make this RCDATA to avoid problems with inclusions
1488 ;; like SPOT, but that would conflict with the omitted end-tag, I
1489 ;; think.
1490 (content-model . [((*data)
1491 include-space
1492 (((option) *close))
1493 nil)])
1494 (end-tag-omissible . t))
1495 ;; TEXTAREA - - (#PCDATA) -(INPUT|TEXTAREA|KEYGEN|SELECT)
1496 ((textarea)
1497 ;; Same comment as for OPTION about RCDATA.
1498 (content-model . [((*data) include-space nil nil)])
1499 (exclusions . (input select label keygen textarea)))
1500 ((hr br img isindex input keygen overlay wbr spot tab
1501 %headempty %mathdelims)
1502 (content-model . EMPTY))
1503 ((nextid)
1504 (content-model . EMPTY)
1505 (deprecated . t))
1506 ((a)
1507 (content-model . [((%text)
1508 include-space
1509 (((%heading)
1510 *include *same "deprecated inside A")
1511 ;; *** I haven't made up my mind whether this
1512 ;; is a good idea. It can result in a lot of
1513 ;; bad formatting if the A is *never* closed.
1514 ;;((p) *discard *same error)
1515 )
1516 nil)])
1517 (exclusions . (a)))
1518 ((b %font %phrase %misc nobr %emacsw3-crud-nonempty)
1519 (content-model . [((%text)
1520 include-space
1521 ((%in-text-ignore))
1522 nil)]))
1523 ((plaintext)
1524 (content-model . XXCDATA)
1525 (end-tag-omissible . t)
1526 (deprecated . obsolete))
1527 ((xmp listing)
1528 (content-model . XCDATA)
1529 (deprecated . obsolete))
1530 ;; Latest table spec (as of Nov. 13 1995) is at:
1531 ;; <URL:ftp://ds.internic.net/internet-drafts/draft-ietf-html-tables-03.txt>
1532 ((table)
1533 (content-model . [(nil
1534 nil
1535 (((caption) *include *next)
1536 ((col colgroup thead tfoot tbody tr) *retry *next))
1537 (*retry *next)) ;error handling
1538 ((col colgroup)
1539 nil
1540 (((thead tfoot tbody tr) *retry *next))
1541 (*retry *next)) ;error handling
1542 (nil
1543 nil
1544 (((thead) *include *next)
1545 ((tfoot tbody tr) *retry *next))
1546 (*retry *next)) ;error handling
1547 (nil
1548 nil
1549 (((tfoot) *include *next)
1550 ((tbody tr) *retry *next))
1551 (*retry *next)) ;error handling
1552 ((tbody)
1553 nil
1554 (((tr) tbody *same)
1555 ;; error handling
1556 ((%body.content) tbody *same error))
1557 nil)]))
1558 ((colgroup)
1559 (content-model . [((col)
1560 nil
1561 (((colgroup thead tfoot tbody tr) *close))
1562 nil)])
1563 (end-tag-omissible . t))
1564 ((col)
1565 (content-model . EMPTY))
1566 ((thead)
1567 (content-model . [((tr)
1568 nil
1569 (((tfoot tbody) *close)
1570 ;; error handling
1571 ((%body.content) tr *same error))
1572 nil)])
1573 (end-tag-omissible . t))
1574 ((tfoot tbody)
1575 (content-model . [((tr)
1576 nil
1577 (((tbody) *close)
1578 ;; error handling
1579 ((%body.content) tr *same error))
1580 nil)])
1581 (end-tag-omissible . t))
1582 ((tr)
1583 (content-model . [((td th)
1584 nil
1585 (((tr tfoot tbody) *close)
1586 ;; error handling
1587 ((%body.content) td *same error))
1588 nil)])
1589 (end-tag-omissible . t))
1590 ((td th)
1591 ;; Arrgh! Another %body.content!!! Stupid!!!
1592 (content-model . [((%body.content)
1593 nil
1594 (((td th tr tfoot tbody) *close)
1595 ;; Push <P> before data characters. Non-SGML.
1596 ((%text) p))
1597 nil)])
1598 (end-tag-omissible . t))
1599 ((math)
1600 (content-model . [((*data) include-space nil nil)])
1601 (overrides .
1602 ((w3-p-d-shortref-chars t . "\{_^")
1603 (w3-p-d-shortrefs t . (("\\^" . "<sup>")
1604 ("_" . "<sub>")
1605 ("{" . "<box>")))))
1606 (inclusions . (%math))
1607 (exclusions . (%notmath)))
1608 ((sup)
1609 (content-model . [((%text)
1610 include-space
1611 ((%in-text-ignore))
1612 nil)])
1613 (overrides .
1614 ((w3-p-d-shortref-chars t . "\{_^")
1615 (w3-p-d-shortrefs t . (("\\^" . "</sup>")
1616 ("_" . "<sub>")
1617 ("{" . "<box>"))))))
1618 ((sub)
1619 (content-model . [((%text)
1620 include-space
1621 ((%in-text-ignore))
1622 nil)])
1623 (overrides .
1624 ((w3-p-d-shortref-chars t . "\{_^")
1625 (w3-p-d-shortrefs t . (("\\^" . "<sup>")
1626 ("_" . "</sub>")
1627 ("{" . "<box>"))))))
1628 ((box)
1629 (content-model . [((%formula)
1630 include-space
1631 (((left) *include 1)
1632 ((over atop choose) *include 2)
1633 ((right) *include 3))
1634 nil)
1635 ((%formula)
1636 include-space
1637 (((over atop choose) *include 2)
1638 ((right) *include 3))
1639 nil)
1640 ((%formula)
1641 include-space
1642 (((right) *include 3))
1643 nil)
1644 ((%formula) include-space nil nil)])
1645 (overrides .
1646 ((w3-p-d-shortref-chars t . "{}_^")
1647 (w3-p-d-shortrefs t . (("\\^" . "<sup>")
1648 ("_" . "<sub>")
1649 ("{" . "<box>")
1650 ("}" . "</box>"))))))
1651 ((above below %mathvec t bt sqrt)
1652 (content-model . [((%formula) include-space nil nil)]))
1653 ;; ROOT has a badly-specified content-model in HTML 3.0.
1654 ((root)
1655 (content-model . [((%formula)
1656 include-space
1657 (((of) *include *next))
1658 nil)
1659 ((%formula) include-space nil nil)]))
1660 ((of)
1661 (content-model . [((%formula) include-space nil nil)])
1662 ;; There is no valid way to infer a missing end-tag for OF. This
1663 ;; is bizarre.
1664 (end-tag-omissible . t))
1665 ((array)
1666 (content-model . [((row) nil nil nil)]))
1667 ((row)
1668 (content-model . [((item) nil (((row) *close)) nil)])
1669 (end-tag-omissible . t))
1670 ((item)
1671 (content-model . [((%formula)
1672 include-space
1673 (((row item) *close))
1674 nil)])
1675 (end-tag-omissible . t))
1676 ;; The old parser would look for the </EMBED> end-tag and include
1677 ;; the contents between <EMBED> and </EMBED> as the DATA attribute
1678 ;; of the EMBED start-tag. However, it did not require the
1679 ;; </EMBED> end-tag and did nothing if it was missing. This is
1680 ;; completely impossible to specify in SGML.
1681 ;;
1682 ;; See
1683 ;; <URL:http://www.eit.com/goodies/lists/www.lists/www-html.1995q3/0603.html>
1684 ;;
1685 ;; Questions: Does EMBED require the end-tag? How does NOEMBED fit
1686 ;; into this? Where can EMBED appear?
1687 ;;
1688 ;; Nov. 25 1995: a new spec for EMBED (also an I-D):
1689 ;; <URL:http://www.cs.princeton.edu/~burchard/www/interactive/>
1690 ;;
1691 ;; Here is my guess how to code EMBED:
1692 ((embed)
1693 (content-model . [((noembed) nil nil (*close))]))
1694 ((noembed)
1695 (content-model . [((%body.content) ; hack hack hack
1696 nil
1697 (((%text) p))
1698 nil)]))
1699 ;;
1700 ;; FRAMESET is a Netscape thing.
1701 ;; <URL:http://www.eit.com/goodies/lists/www.lists/www-html.1995q3/0588.html>
1702 ((frameset)
1703 (content-model . [((noframes frame frameset) nil nil nil)]))
1704 ((noframes)
1705 (content-model . [((%body.content)
1706 nil
1707 ;; Push <P> before data characters. Non-SGML.
1708 (((%text) p))
1709 nil)]))
1710 ((frame)
1711 (content-model . EMPTY))
1712 ;;
1713 ;; APPLET is a Java thing.
1714 ;; <URL:http://java.sun.com/JDK-beta/filesinkit/README>
1715 ((applet)
1716 ;; I really don't want to add another ANY content-model.
1717 (content-model . XINHERIT)
1718 (inclusions . (param)))
1719 ((param)
1720 (content-model . EMPTY))
1721 ;; backward compatibility with old Java.
1722 ((app)
1723 (content-model . EMPTY))
1724 ;; Client-side image maps.
1725 ;; <URL:ftp://ds.internic.net/internet-drafts/draft-seidman-clientsideimagemap-01.txt>
1726 ;; *** The only problem is that I don't know in what elements MAP
1727 ;; can appear, so none of this is reachable yet.
1728 ((map)
1729 (content-model . [((area) nil nil nil)]))
1730 ((area)
1731 (content-model . EMPTY))
1732 )))))
1733
1734
1735 ;;;
1736 ;;; Omitted tag inference using state transition tables.
1737 ;;;
1738
1739 (eval-when-compile
1740
1741 (w3-p-s-var-def w3-p-s-includep)
1742 (w3-p-s-var-def w3-p-s-state-transitions)
1743 (w3-p-s-var-def w3-p-s-transition)
1744 (w3-p-s-var-def w3-p-s-tran-list)
1745 (w3-p-s-var-def w3-p-s-content-model)
1746 (w3-p-s-var-def w3-p-s-except)
1747 ;; Uses free variables:
1748 ;; w3-p-d-current-element, w3-p-d-exceptions
1749 ;; Destroys free variables:
1750 ;; w3-p-s-includep, w3-p-s-state-transitions, w3-p-s-transition,
1751 ;; w3-p-s-tran-list, w3-p-s-content-model, w3-p-s-except
1752 ;; Returns t if the element or data characters should be included.
1753 ;; Returns nil if the element or data characters should be discarded.
1754 (defsubst w3-grok-tag-or-data (tag-name)
1755 (while
1756 (cond
1757 ((symbolp (setq w3-p-s-content-model
1758 (w3-element-content-model w3-p-d-current-element)))
1759 (or (and (memq w3-p-s-content-model
1760 '(CDATA RCDATA XCDATA XXCDATA))
1761 (memq tag-name '(*data *space)))
1762 ;; *** Implement ANY.
1763 (error "impossible"))
1764 (setq w3-p-s-includep t)
1765 ;; Exit loop.
1766 nil)
1767 (t
1768 ;; We have a complex content model.
1769 ;; Cache some data from the element info structure. Format is:
1770 ;; (INCLUDES INCSPACEP (((TAG ...) . TRANSITION) ...) DEFAULT)
1771 (setq w3-p-s-state-transitions
1772 (aref w3-p-s-content-model
1773 (w3-element-state w3-p-d-current-element)))
1774
1775 ;; Optimize the common cases.
1776 (cond
1777 ((eq '*space tag-name)
1778 ;; Optimizing the (*space *discard *same nil) transition.
1779 (setq w3-p-s-includep (car (cdr w3-p-s-state-transitions)))
1780 ;; Don't loop.
1781 nil)
1782 ((and (not (setq w3-p-s-except
1783 (assq tag-name w3-p-d-exceptions)))
1784 (memq tag-name (car w3-p-s-state-transitions)))
1785 ;; Equivalent to a transition of (TAG *include *same nil).
1786 ;; So we are done, return t to caller.
1787 (setq w3-p-s-includep t)
1788 ;; Exit loop.
1789 nil)
1790 (t
1791 ;; The general case.
1792 (cond
1793 ;; Handle inclusions and exclusions.
1794 (w3-p-s-except
1795 (setq w3-p-s-transition (cdr w3-p-s-except)))
1796 ;; See if the transition is in the complex transitions
1797 ;; component.
1798 ((progn
1799 (setq w3-p-s-tran-list
1800 (car (cdr (cdr w3-p-s-state-transitions))))
1801 (setq w3-p-s-transition nil)
1802 (while w3-p-s-tran-list
1803 (cond ((memq tag-name (car (car w3-p-s-tran-list)))
1804 ;; We've found a transition.
1805 (setq w3-p-s-transition
1806 (cdr (car w3-p-s-tran-list)))
1807 (setq w3-p-s-tran-list nil))
1808 (t
1809 (setq w3-p-s-tran-list (cdr w3-p-s-tran-list)))))
1810 ;; Check if we found it.
1811 w3-p-s-transition)
1812 ;; body of cond clause empty
1813 )
1814 ;; Try finding the transition in the DEFAULT component of the
1815 ;; transition table, but avoid doing this for unknown elements,
1816 ;; always use the default-default for them.
1817 ((and (or (eq '*data tag-name)
1818 (w3-known-element-p tag-name))
1819 (setq w3-p-s-transition
1820 (nth 3 w3-p-s-state-transitions)))
1821 ;; body of cond clause empty
1822 )
1823 (t
1824 ;; Supply a default-default transition.
1825 (if (not (or (eq '*data tag-name)
1826 (w3-known-element-p tag-name)))
1827 (setq w3-p-s-transition
1828 '(*discard *same "unknown element"))
1829
1830 ;; Decide whether to *close or *discard
1831 ;; based on whether this element would be
1832 ;; accepted as valid in an open ancestor.
1833 (let ((open-list w3-p-d-open-element-stack)
1834 (all-end-tags-omissible
1835 (w3-element-end-tag-omissible w3-p-d-current-element))
1836 state-transitions tran-list)
1837 (if (catch 'found
1838 (while open-list
1839 (setq state-transitions
1840 (aref (w3-element-content-model
1841 (car open-list))
1842 (w3-element-state (car open-list))))
1843 (if (memq tag-name (car state-transitions))
1844 (throw 'found t))
1845 (setq tran-list (nth 2 state-transitions))
1846 (while tran-list
1847 (cond ((memq tag-name (car (car tran-list)))
1848 (if (not (nth 3 (car tran-list)))
1849 ;; Not an error transition.
1850 (throw 'found t))
1851 (setq tran-list nil))
1852 (t
1853 (setq tran-list (cdr tran-list)))))
1854 ;; The input item is not accepted in this
1855 ;; ancestor. Try again in next ancestor.
1856 (or (w3-element-end-tag-omissible (car open-list))
1857 (setq all-end-tags-omissible nil))
1858 (setq open-list (cdr open-list)))
1859 nil)
1860 (setq w3-p-s-transition
1861 (if (w3-element-end-tag-omissible
1862 w3-p-d-current-element)
1863 (if all-end-tags-omissible
1864 ;; Probably indicates a need to debug
1865 ;; the DTD state-transition tables.
1866 '(*close *same
1867 "missing transition in DTD?")
1868 ;; Error will be reported later.
1869 '(*close *same))
1870 '(*close *same "not allowed here")))
1871 (setq w3-p-s-transition
1872 '(*discard *same "not allowed here")))))))
1873
1874 ;; We have found a transition to take. The transition is of
1875 ;; the format (ACTION NEW-STATE ERRORP) where the latter two
1876 ;; items are optional.
1877
1878 ;; First, handle any state-change.
1879 (or (memq (car-safe (cdr w3-p-s-transition)) '(nil *same))
1880 (w3-set-element-state
1881 w3-p-d-current-element
1882 (if (eq '*next (car-safe (cdr w3-p-s-transition)))
1883 (1+ (w3-element-state w3-p-d-current-element))
1884 (car-safe (cdr w3-p-s-transition)))))
1885
1886 ;; Handle any error message.
1887 (if (car-safe (cdr-safe (cdr w3-p-s-transition)))
1888 (w3-debug-html
1889 :mandatory-if (and (eq '*data tag-name)
1890 (eq '*discard (car w3-p-s-transition)))
1891 (format "Bad %s [%s], %s"
1892 (if (eq '*data tag-name)
1893 "data characters"
1894 (concat "start-tag "
1895 (w3-sgml-name-to-string tag-name)))
1896 (if (stringp (car (cdr (cdr w3-p-s-transition))))
1897 (car (cdr (cdr w3-p-s-transition)))
1898 "not allowed here")
1899 (let ((action (car w3-p-s-transition)))
1900 (cond ((eq '*discard action)
1901 "discarding bad item")
1902 ((eq '*close action)
1903 (concat "inferring </"
1904 (w3-sgml-name-to-string
1905 (w3-element-name
1906 w3-p-d-current-element))
1907 ">"))
1908 ((eq '*include action)
1909 "including bad item anyway")
1910 ((eq '*retry action)
1911 "*retry ??? you shouldn't see this")
1912 (t
1913 (concat "inferring <"
1914 (w3-sgml-name-to-string action)
1915 ">")))))))
1916
1917 ;; Handle the action.
1918 (cond
1919 ((eq '*include (car w3-p-s-transition))
1920 (setq w3-p-s-includep t)
1921 ;; Exit loop.
1922 nil)
1923 ((eq '*close (car w3-p-s-transition))
1924 ;; Perform end-tag inference.
1925 (w3-close-element) ; don't pass parameter
1926 ;; Loop and try again in parent element's content-model.
1927 t)
1928 ((eq '*discard (car w3-p-s-transition))
1929 (setq w3-p-s-includep nil)
1930 ;; Exit loop.
1931 nil)
1932 ((eq '*retry (car w3-p-s-transition))
1933 ;; Loop and try again after state change.
1934 t)
1935 ((symbolp (car w3-p-s-transition))
1936 ;; We need to open another element to contain the text,
1937 ;; probably a <P> (look in the state table).
1938 (w3-open-element (car w3-p-s-transition) nil)
1939 ;; Now we loop and try again in the new element's
1940 ;; content-model.
1941 t)
1942 (t
1943 (error "impossible")))))))
1944
1945 ;; Empty while loop body.
1946 )
1947
1948 ;; Return value to user indicating whether to include or discard item:
1949 ;; t ==> include
1950 ;; nil ==> discard
1951 w3-p-s-includep)
1952
1953 )
1954
1955
1956 ;;;
1957 ;;; Main parser.
1958 ;;;
1959
1960 (defvar w3-last-parse-tree nil
1961 "Used for debugging only. Stores the most recently computed parse tree
1962 \(a tree, not a parse tag stream\).")
1963
1964 (defun w3-display-parse-tree (&optional ptree)
1965 (interactive)
1966 (with-output-to-temp-buffer "W3 HTML Parse Tree"
1967 (set-buffer standard-output)
1968 (emacs-lisp-mode)
1969 (require 'pp)
1970 (pp (or ptree w3-last-parse-tree))))
1971
1972 (defalias 'w3-display-last-parse-tree 'w3-display-parse-tree)
1973
1974 ;; For compatibility with the old parser interface.
1975 (defalias 'w3-preparse-buffer 'w3-parse-buffer)
1976
1977 ;; %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1978 ;; % %
1979 ;; % This is the *ONLY* valid entry point in this file! %
1980 ;; % DO NOT call any of the other functions! %
1981 ;; % %
1982 ;; %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1983 (defun w3-parse-buffer (&optional buff nodraw)
1984 "Parse contents of BUFF as HTML.
1985 BUFF defaults to the value of url-working-buffer.
1986 Destructively alters contents of BUFF.
1987 Unless optional second argument NODRAW is non-nil, calls the display
1988 engine on the parsed HTML.
1989 Returns a data structure containing the parsed information."
1990
1991 (set-buffer (or buff url-working-buffer))
1992 (setq buff (current-buffer))
1993 (set-syntax-table w3-sgml-md-syntax-table)
1994 (buffer-disable-undo (current-buffer))
1995 (widen) ; sanity checking
1996 (goto-char (point-min))
1997 (setq case-fold-search t) ; allows smaller regexp patterns
1998
1999 ;; Some unknown pre-parse buffer munging.
2000 (if (fboundp 'sera-to-fidel-marker)
2001 (let ((sera-being-called-by-w3 t))
2002 ;; eval stops the compiler from complaining.
2003 (eval '(sera-to-fidel-marker))))
2004 (goto-char (point-min))
2005
2006 ;; *** Should premunge line boundaries.
2007 ;; ********************
2008
2009 ;; Prepare another buffer to draw in unless told not to.
2010 (if (not nodraw)
2011 (w3-prepare-draw-buffer-for-parse-buffer))
2012
2013 (let* (
2014 ;; Speed hack, see the variable doc string.
2015 (gc-cons-threshold (if (> w3-gc-cons-threshold-multiplier 0)
2016 (* w3-gc-cons-threshold-multiplier
2017 gc-cons-threshold)
2018 gc-cons-threshold))
2019
2020 ;; Used to determine if we made any progress since the last loop.
2021 (last-loop-start (point-min))
2022
2023 ;; How many iterations of the main loop have occurred. Used only
2024 ;; to send messages to the user periodically, since this function
2025 ;; can take some time.
2026 (loop-count 0)
2027
2028 ;; Precomputing the loop-invariant parts of this for speed.
2029 (status-message-format
2030 (if url-show-status
2031 (format "Parsed %%3d%%%% of %d..." (- (point-max) (point-min)))))
2032
2033 ;; Use a float value for 100 if possible, otherwise integer.
2034 ;; Determine which we can use outside of the loop for speed.
2035 (one-hundred (funcall (if (fboundp 'float) 'float 'identity) 100))
2036
2037 ;; Speed up checking whether to do incremental display.
2038 (w3-do-incremental-display (if nodraw nil w3-do-incremental-display))
2039
2040 ;; Used to convert parse tree to tag stream that old display
2041 ;; engine expects. Will change when display engine is rewritten.
2042 (parse-tag-stream '(*dummy))
2043
2044 ;; See doc string.
2045 (w3-p-d-parse-tag-stream-tail-pointer parse-tag-stream)
2046
2047 ;; Points to cons cell in parse-tag-stream whose car is the last
2048 ;; item that has been sent to display engine.
2049 (parse-tag-stream-last-displayed-item parse-tag-stream)
2050
2051 ;; The buffer which contains the HTML we are parsing. This
2052 ;; variable is used to avoid using the more expensive
2053 ;; save-excursion.
2054 (parse-buffer (current-buffer))
2055
2056 ;; Points to start of region of text since the previous tag.
2057 (between-tags-start (point-min))
2058
2059 ;; Points past end of region of text since the previous tag. Only
2060 ;; non-nil when the region has been completely determined and is
2061 ;; ready to be processed.
2062 between-tags-end
2063
2064 ;; See doc string.
2065 w3-p-d-tag-name
2066
2067 ;; See doc string.
2068 w3-p-d-end-tag-p
2069
2070 ;; Is the tag we are looking at a null-end-tag-enabling
2071 ;; start-tag?
2072 net-tag-p
2073
2074 ;; Attributes of the tag we are looking at. An alist whose items
2075 ;; are pairs of the form (SYMBOL . STRING).
2076 tag-attributes
2077
2078 ;; Points past end of attribute value we are looking at. Points
2079 ;; past the syntactic construct, not the value of the attribute,
2080 ;; which may be at (1- attribute-value-end).
2081 attribute-value-end
2082
2083 ;; Points past end of tag we are looking at.
2084 tag-end
2085
2086 ;; See doc string.
2087 (w3-p-d-current-element (w3-fresh-element-for-tag '*document))
2088
2089 ;; See doc string.
2090 (w3-p-d-open-element-stack (list (w3-fresh-element-for-tag '*holder)))
2091
2092 ;; ***not implemented yet***
2093 (marked-section-undo-stack nil)
2094
2095 ;; See doc string.
2096 (w3-p-d-debug-url t)
2097
2098 ;; Any of the following variables with the comment ";*NESTED*"
2099 ;; are syntactic or semantic features that were introduced by
2100 ;; some containing element or marked section which will be undone
2101 ;; when we close that element or marked section.
2102
2103 ;; See doc string.
2104 (w3-p-d-non-markup-chars nil) ;*NESTED*
2105
2106 ;; See doc string.
2107 (w3-p-d-null-end-tag-enabled nil) ;*NESTED*
2108
2109 ;; See doc string.
2110 (w3-p-d-in-parsed-marked-section nil) ;*NESTED*
2111
2112 ;; See doc string.
2113 (w3-p-d-shortrefs nil) ;*NESTED*
2114
2115 ;; See doc string.
2116 (w3-p-d-shortref-chars nil) ;*NESTED*
2117
2118 ;; ******* maybe not needed.
2119 ;;
2120 ;; ;; Are we recognizing start-tags?
2121 ;; (recognizing-start-tags t) ;*NESTED*
2122 ;;
2123 ;; ;; Are we recognizing end-tags? If this is non-nil and not t,
2124 ;; ;; then only the end tag of the current open element is
2125 ;; ;; recognized.
2126 ;; (recognizing-end-tags t) ;*NESTED*
2127
2128 ;; See doc string.
2129 (w3-p-d-exceptions nil) ;*NESTED*
2130
2131 ;; Scratch variables used in this function
2132 ref attr-name attr-value content-model content open-list
2133 )
2134 ;; Scratch variables used by macros and defsubsts we call.
2135 (w3-p-s-let-bindings
2136
2137 (w3-update-non-markup-chars)
2138
2139 ;; Main loop. Handle markup as follows:
2140 ;;
2141 ;; non-empty tag: Handle the region since the previous tag as PCDATA,
2142 ;; RCDATA, CDATA, if allowed by syntax. Then handle the tag.
2143 ;;
2144 ;; general entity (&name;): expand it and parse the result.
2145 ;;
2146 ;; shortref (_, {, }, and ^ in math stuff): Expand it and parse the
2147 ;; result.
2148 ;;
2149 ;; SGML marked section (<![ keywords [ conditional-text ]]>): Either
2150 ;; strip the delimiters and parse the result or delete.
2151 ;;
2152 ;; comment: Delete.
2153 ;;
2154 ;; empty tag (<>, </>): Handle as the appropriate tag.
2155 ;;
2156 ;; markup declaration (e.g. <!DOCTYPE ...>): Delete.
2157 ;;
2158 ;; SGML processing instruction (<?name>): Delete.
2159 ;;
2160 (while
2161 ;; Continue as long as we processed something last time and we
2162 ;; have more to process.
2163 (prog1
2164 (not (and (= last-loop-start (point))
2165 (eobp)))
2166 (setq last-loop-start (point)))
2167
2168 ;; Display progress messages if asked and/or do incremental display
2169 ;; of results
2170 (cond ((= 0 (% (setq loop-count (1+ loop-count)) 40))
2171 (if w3-do-incremental-display
2172 (w3-pause))
2173 (if status-message-format
2174 (message status-message-format
2175 ;; Percentage of buffer processed.
2176 (/ (* (point) one-hundred) (point-max))))))
2177
2178 ;; Go to next interesting thing in the buffer.
2179 (skip-chars-forward w3-p-d-non-markup-chars)
2180
2181 ;; We are looking at a markup-starting character, and invalid
2182 ;; character, or end of buffer.
2183 (cond
2184
2185 ((= ?< (following-char))
2186
2187 ;; We are looking at a tag, comment, markup declaration, SGML marked
2188 ;; section, SGML processing instruction, or non-markup "<".
2189 (forward-char)
2190 (cond
2191
2192 ((looking-at "/?\\([a-z][-a-z0-9.]*\\)")
2193 ;; We are looking at a non-empty tag.
2194
2195 (setq w3-p-d-tag-name
2196 (intern (downcase (buffer-substring (match-beginning 1)
2197 (match-end 1)))))
2198 (setq w3-p-d-end-tag-p (= ?/ (following-char)))
2199 (setq between-tags-end (1- (point)))
2200 (goto-char (match-end 0))
2201
2202 ;; Read the attributes from a start-tag.
2203 (or
2204 w3-p-d-end-tag-p
2205
2206 ;; Attribute values can be:
2207 ;; "STRING" where STRING does not contain the double quote
2208 ;; 'STRING' where STRING does not contain the single quote
2209 ;; name-start character, *name character
2210 ;; *name character
2211 ;; Digit, +name character
2212 ;; +Digit
2213 ;; or a SPACE-separated list of one of the last four
2214 ;; possibilities (there is a comment somewhere that this is a
2215 ;; misinterpretation of the grammar, so we ignore this
2216 ;; possibility).
2217 (while
2218 (looking-at
2219 (eval-when-compile
2220 (concat
2221 ;; Leading whitespace.
2222 "[ \n\r\t]*"
2223 ;; The attribute name, possibly with a bad syntax
2224 ;; component.
2225 "\\([a-z][-a-z0-9.]*\\(\\([_][-a-z0-9._]*\\)?\\)\\)"
2226 ;; Trailing whitespace and perhaps an "=".
2227 "[ \n\r\t]*\\(\\(=[ \n\r\t]*\\)?\\)")))
2228
2229 (cond ((/= (match-beginning 2) (match-end 2))
2230 (w3-debug-html
2231 :nocontext
2232 (format "Bad attribute name syntax: %s"
2233 (buffer-substring (match-beginning 1)
2234 (match-end 1))))))
2235
2236 (setq attr-name
2237 (intern (downcase (buffer-substring (match-beginning 1)
2238 (match-end 1)))))
2239 (goto-char (match-end 0))
2240 (cond
2241 ((< (match-beginning 4) (match-end 4))
2242 ;; A value was specified (e.g. ATTRIBUTE=VALUE).
2243 (cond
2244 ((looking-at
2245 (eval-when-compile
2246 (concat
2247 ;; Literal with double quotes.
2248 "\"\\([^\"]*\\)\""
2249 "\\|"
2250 ;; Literal with single quotes.
2251 "'\\([^']\\)*'"
2252 "\\|"
2253 ;; Handle bad HTML conflicting with NET-enabling
2254 ;; start-tags.
2255 "\\([-a-z0-9.]+/[-a-z0-9._/#]+\\)[ \t\n\r>]"
2256 "\\|"
2257 ;; SGML NAME-syntax attribute value.
2258 "\\([-a-z0-9.]+\\)[ \t\n\r></]"
2259 )))
2260 (cond
2261 ((or (match-beginning 1)
2262 (match-beginning 2))
2263 ;; We have an attribute value literal.
2264 (narrow-to-region (1+ (match-beginning 0))
2265 (1- (match-end 0)))
2266
2267 ;; In attribute value literals, EE and RS are ignored
2268 ;; and RE and SEPCHAR characters sequences are
2269 ;; replaced by SPACEs.
2270 ;;
2271 ;; (There is no way right now to get RS into one of
2272 ;; these so that it can be ignored. This is due to
2273 ;; our using Unix line-handling conventions.)
2274 (skip-chars-forward "^&\t\n\r")
2275 (if (eobp)
2276 nil
2277 ;; We must expand entities and replace RS, RE,
2278 ;; and SEPCHAR.
2279 (goto-char (point-min))
2280 (while (progn
2281 (skip-chars-forward "^&")
2282 (not (eobp)))
2283 (w3-expand-entity-at-point-maybe))
2284 (subst-char-in-region (point-min) (point-max) ?\t 32)
2285 (subst-char-in-region (point-min) (point-max) ?\n 32))
2286 ;; Set this after we have changed the size of the
2287 ;; attribute.
2288 (setq attribute-value-end (1+ (point-max))))
2289 ((match-beginning 4)
2290 (setq attribute-value-end (match-end 4))
2291 (narrow-to-region (point) attribute-value-end))
2292 ((match-beginning 3)
2293 (setq attribute-value-end (match-end 3))
2294 (narrow-to-region (point) attribute-value-end)
2295 ;; Horribly illegal non-SGML handling of bad
2296 ;; HTML on the net. This can break valid HTML.
2297 (setq attr-value (buffer-substring (point)
2298 (match-end 3)))
2299 (w3-debug-html :nocontext
2300 (format "Evil attribute value syntax: %s"
2301 (buffer-substring (point-min) (point-max)))))
2302 (t
2303 (error "impossible"))))
2304 ((memq (following-char) '(?\" ?'))
2305 ;; Missing terminating quote character.
2306 (narrow-to-region (point)
2307 (progn
2308 (forward-char 1)
2309 (skip-chars-forward "^ \t\n\r'\"=<>")
2310 (setq attribute-value-end (point))))
2311 (w3-debug-html :nocontext
2312 (format "Attribute value missing end quote: %s"
2313 (buffer-substring (point-min) (point-max))))
2314 (narrow-to-region (1+ (point-min)) (point-max)))
2315 (t
2316 ;; We have a syntactically invalid attribute value. Let's
2317 ;; make a best guess as to what the author intended.
2318 (narrow-to-region (point)
2319 (progn
2320 (skip-chars-forward "^ \t\n\r'\"=<>")
2321 (setq attribute-value-end (point))))
2322 (w3-debug-html :nocontext
2323 (format "Bad attribute value syntax: %s"
2324 (buffer-substring (point-min) (point-max))))))
2325 ;; Now we have isolated the attribute value. We need to
2326 ;; munge the value depending on the syntax of the
2327 ;; attribute.
2328 ;; *** Right now, we only implement the necessary munging
2329 ;; for CDATA attributes, which is none. I'm not sure why
2330 ;; this happens to work for other attributes right now.
2331 ;; For any other kind of attribute, we are supposed to
2332 ;; * smash case
2333 ;; * remove leading/trailing whitespace
2334 ;; * smash multiple space sequences into single spaces
2335 ;; * verify the syntax of each token
2336 (setq attr-value (buffer-substring (point-min) (point-max)))
2337 (widen)
2338 (goto-char attribute-value-end))
2339 (t
2340 ;; No value was specified, in which case NAME should be
2341 ;; taken as ATTRIBUTE=NAME where NAME is one of the
2342 ;; enumerated values for ATTRIBUTE.
2343 ;; We assume here that ATTRIBUTE is the same as NAME.
2344 ;; *** Another piece of code will fix the attribute name if it
2345 ;; is wrong.
2346 (setq attr-value (symbol-name attr-name))))
2347
2348 ;; Accumulate the attributes.
2349 (setq tag-attributes (cons (cons attr-name attr-value)
2350 tag-attributes))))
2351
2352 ;; Process the end of the tag.
2353 (skip-chars-forward " \t\n\r")
2354 (cond ((= ?> (following-char))
2355 ;; Ordinary tag end.
2356 (forward-char 1))
2357 ((and (= ?/ (following-char))
2358 (not w3-p-d-end-tag-p))
2359 ;; This is a NET-enabling start-tag.
2360 (setq net-tag-p t)
2361 (forward-char 1))
2362 ((= ?< (following-char))
2363 ;; *** Strictly speaking, the following text has to
2364 ;; lexically be STAGO or ETAGO, which means that it
2365 ;; can't match some other lexical unit.
2366 ;; Unclosed tag.
2367 nil)
2368 (t
2369 ;; Syntax error.
2370 (w3-debug-html
2371 (format "Bad unclosed %s%s tag"
2372 (if w3-p-d-end-tag-p "/" "")
2373 (w3-sgml-name-to-string w3-p-d-tag-name)))))
2374
2375 (setq tag-end (point)))
2376
2377 ((looking-at "/?>")
2378 ;; We are looking at an empty tag (<>, </>).
2379 (setq w3-p-d-end-tag-p (= ?/ (following-char)))
2380 (setq w3-p-d-tag-name (if w3-p-d-end-tag-p
2381 (w3-element-name w3-p-d-current-element)
2382 ;; *** Strictly speaking, if OMITTAG NO, then
2383 ;; we should use the most recently closed tag.
2384 ;; But OMITTAG YES in HTML and I'm lazy.
2385 (w3-element-name w3-p-d-current-element)))
2386 (setq tag-attributes nil)
2387 ;; *** Make sure this is not at top level.
2388 (setq between-tags-end (1- (point)))
2389 (setq tag-end (match-end 0)))
2390
2391 ;; *** In SGML, <(doctype)element> is valid tag syntax. This
2392 ;; cannot occur in HTML because the CONCUR option is off in the
2393 ;; SGML declaration.
2394
2395 ((looking-at "!--")
2396 ;; We found a comment, delete to end of comment.
2397 (delete-region
2398 (1- (point))
2399 (progn
2400 (forward-char 1)
2401 ;; Skip over pairs of -- ... --.
2402 (if (looking-at "\\(--[^-]*\\(-[^-]+\\)*--[ \t\r\n]*\\)+>")
2403 (goto-char (match-end 0))
2404 ;; Syntax error!
2405 (w3-debug-html
2406 "Bad comment (unterminated or unbalanced \"--\" pairs)")
2407 (forward-char 2)
2408 (or (re-search-forward "--[ \t\r\n]*>" nil t)
2409 (search-forward ">" nil t)))
2410 (point))))
2411
2412 ((looking-at "!>\\|\\?[^>]*>")
2413 ;; We are looking at an empty comment or a processing
2414 ;; instruction. Delete it.
2415 (replace-match "")
2416 (delete-char -1))
2417
2418 ((looking-at "![a-z]")
2419 ;; We are looking at a markup declaration. Delete it.
2420 ;; *** Technically speaking, to handle valid HTML I think we
2421 ;; need to handle "<!USEMAP ... >" declarations. In the future,
2422 ;; to handle general SGML, we should parse "<!DOCTYPE ... >"
2423 ;; declarations as well (which can contain other declarations).
2424 ;; In the very distant future, perhaps we will handle "<!SGML
2425 ;; ... >" declarations.
2426 ;; *** Should warn if it's not SGML, DOCTYPE, or USEMAP.
2427 (backward-char 1)
2428 (delete-region
2429 (point)
2430 (progn
2431 (condition-case nil
2432 (forward-sexp 1)
2433 (error
2434 ;; *** This might not actually be bad syntax, but might
2435 ;; instead be a -- ... -- comment with unbalanced
2436 ;; parentheses somewhere inside the declaration. Handling
2437 ;; this properly would require full parsing of markup
2438 ;; declarations, a goal for the future.
2439 (w3-debug-html "Bad <! syntax.")
2440 (skip-chars-forward "^>")
2441 (if (= ?> (following-char))
2442 (forward-char))))
2443 (point))))
2444
2445 ((looking-at "!\\\[\\(\\([ \t\n\r]*[a-z]+\\)+[ \t\n\r]*\\)\\\[")
2446 ;; We are looking at a marked section.
2447 ;; *** Strictly speaking, we should issue a warning if the
2448 ;; keywords are invalid or missing or if the "[" does not follow.
2449 ;; We must look at the keywords to understand how to parse it.
2450 ;; *** Strictly speaking, we should perform parameter entity
2451 ;; substitution on the keywords first.
2452 (goto-char (match-beginning 1))
2453 (insert ?\))
2454 (goto-char (1- (match-beginning 0)))
2455 (delete-char 3)
2456 (insert ?\()
2457 (backward-char 1)
2458 (let* ((keywords (read (current-buffer)))
2459 ;; Multiple keywords may appear, but only the most
2460 ;; significant takes effect. Rank order is IGNORE, CDATA,
2461 ;; RCDATA, INCLUDE, and TEMP. INCLUDE and TEMP have the
2462 ;; same effect.
2463 (keyword (car-safe (cond ((memq 'IGNORE keywords))
2464 ((memq 'CDATA keywords))
2465 ((memq 'RCDATA keywords))
2466 ((memq 'INCLUDE keywords))
2467 ((memq 'TEMP keywords))))))
2468 (or (= ?\[ (following-char))
2469 ;; I probably shouldn't even check this, since it is so
2470 ;; impossible.
2471 (error "impossible"))
2472 (forward-char 1)
2473 (delete-region (1- (match-beginning 0)) (point))
2474 (cond ((eq 'IGNORE keyword)
2475 ;; Scan forward skipping over matching <![ ... ]]>
2476 ;; until we find an unmatched "]]>".
2477 (let ((ignore-nesting 1)
2478 (start-pos (point)))
2479 (while (> ignore-nesting 0)
2480 (if (re-search-forward "<!\\\\\[\\|\]\]>" nil t)
2481 (setq ignore-nesting
2482 (if (eq ?> (preceding-char))
2483 (1- ignore-nesting)
2484 (1+ ignore-nesting)))
2485 (w3-debug-html
2486 "Unterminated IGNORE marked section.")
2487 (setq ignore-nesting 0)
2488 (goto-char start-pos)))
2489 (delete-region start-pos (point))))
2490 ((eq 'CDATA keyword)
2491 (error "***unimplemented***"))
2492 ((eq 'RCDATA keyword)
2493 (error "***unimplemented***"))
2494 ((memq keyword '(INCLUDE TEMP))
2495 (error "***unimplemented***")))))
2496 ((and (looking-at "!")
2497 w3-netscape-compatible-comments)
2498 ;; Horribly illegal non-SGML handling of bad HTML on the net.
2499 ;; This can break valid HTML.
2500 ;; This arises because Netscape discards anything looking like
2501 ;; "<!...>". So people expect they can use this construct as
2502 ;; a comment.
2503 (w3-debug-html "Evil <! comment syntax.")
2504 (backward-char 1)
2505 (delete-region
2506 (point)
2507 (progn
2508 (skip-chars-forward "^>")
2509 (if (= ?> (following-char))
2510 (forward-char))
2511 (point))))
2512 (t
2513 ;; This < is not a markup character. Pretend we didn't notice
2514 ;; it at all. We have skipped over the < already, so just loop
2515 ;; again.
2516 )))
2517
2518 ((= ?& (following-char))
2519 (w3-expand-entity-at-point-maybe))
2520
2521 ((and (= ?\] (following-char))
2522 w3-p-d-in-parsed-marked-section
2523 (looking-at "]]>"))
2524 ;; *** handle the end of a parsed marked section.
2525 (error "***unimplemented***"))
2526
2527 ((and (= ?/ (following-char))
2528 w3-p-d-null-end-tag-enabled)
2529 ;; We are looking at a null end tag.
2530 (setq w3-p-d-end-tag-p t)
2531 (setq between-tags-end (point))
2532 (setq tag-end (1+ (point)))
2533 (setq w3-p-d-tag-name (w3-element-name w3-p-d-current-element)))
2534
2535 ;; This can be slow, since we'll hardly ever get here.
2536 ;; *** Strictly speaking, I think we're supposed to handle
2537 ;; shortrefs that begin with the same characters as other markup,
2538 ;; preferring the longest match.
2539 ;; I will assume that shortrefs never begin with <, &, \], /.
2540 ((setq ref (catch 'found-shortref
2541 (let ((refs w3-p-d-shortrefs))
2542 (while refs
2543 (if (looking-at (car (car refs)))
2544 (throw 'found-shortref (cdr (car refs))))
2545 (setq refs (cdr refs))))))
2546 ;; We are looking at a shortref for which there is an
2547 ;; expansion defined in the current syntax. Replace with the
2548 ;; expansion, leaving point at the beginning so it will be parsed
2549 ;; on the next loop.
2550 ;; *** eek. This is wrong if the shortref is for an entity with
2551 ;; CDATA syntax which should not be reparsed for tags.
2552 (replace-match "")
2553 (let ((pt (point)))
2554 (insert ref)
2555 (goto-char pt)))
2556
2557 ((looking-at (eval-when-compile
2558 (concat "[" (w3-invalid-sgml-chars) "]")))
2559 (w3-debug-html
2560 (format "Invalid SGML character: %c" (following-char)))
2561 (insert (or (cdr-safe (assq (following-char)
2562 ;; These characters are apparently
2563 ;; from a Windows character set.
2564 '((146 . "'")
2565 (153 . "TM"))))
2566 ""))
2567 (delete-char 1))
2568
2569 ((eobp)
2570 ;; We have finished the buffer. Make sure we process the last
2571 ;; piece of text, if any.
2572 (setq between-tags-end (point))
2573 ;; We have to test what's on the element stack because this
2574 ;; piece of code gets executed twice.
2575 (cond ((not (eq '*holder (w3-element-name w3-p-d-current-element)))
2576 ;; This forces the calculation of implied omitted end tags.
2577 (setq w3-p-d-tag-name '*document)
2578 (setq w3-p-d-end-tag-p t)
2579 (setq tag-end (point)))))
2580
2581 (t
2582 (error "unreachable code, this can't happen")))
2583
2584 ;; If we have determined the boundaries of a non-empty between-tags
2585 ;; region of text, then handle it.
2586 (cond
2587 (between-tags-end
2588 (cond
2589 ((< between-tags-start between-tags-end)
2590 ;; We have a non-empty between-tags region.
2591
2592 ;; We check if it's entirely whitespace, because we record the
2593 ;; transitions for whitespace separately from those for
2594 ;; data with non-whitespace characters.
2595 (goto-char between-tags-start)
2596 (skip-chars-forward " \t\n\r" between-tags-end)
2597 (cond
2598 ((w3-grok-tag-or-data (prog1
2599 (if (= between-tags-end (point))
2600 '*space
2601 '*data)
2602 (goto-char between-tags-end)))
2603 ;; We have to include the text in the current element's
2604 ;; contents. If this is the first item in the current
2605 ;; element's contents, don't include a leading newline if
2606 ;; there is one. Add a trailing newline as a separate text
2607 ;; item so that it can be removed later if it turns out to
2608 ;; be the last item in the current element's contents when
2609 ;; the current element is closed.
2610 ;; *** We could perform this test before calling
2611 ;; w3-grok-tag-or-data, but it's not clear which will be
2612 ;; faster in practice.
2613 (or (setq content (w3-element-content w3-p-d-current-element))
2614 ;; *** Strictly speaking, in SGML the record end is
2615 ;; carriage return, not line feed.
2616 (if (= ?\n (char-after between-tags-start))
2617 (setq between-tags-start (1+ between-tags-start))))
2618 (if (= between-tags-start (point))
2619 ;; Do nothing.
2620 nil
2621 ;; We are definitely going to add data characters to the
2622 ;; content.
2623 ;; Protocol is that all but last data character item
2624 ;; must have been sent to display engine.
2625 (and content
2626 (stringp (car content))
2627 ;; Gross, disgusting hack to deal with old interface
2628 ;; to display engine. Remove as soon as possible.
2629 (not (memq (w3-element-name w3-p-d-current-element)
2630 '(plaintext style xmp textarea)))
2631 (w3-add-display-item 'text (car content)))
2632 (cond
2633 ((and (= ?\n (preceding-char))
2634 (/= between-tags-start (1- (point))))
2635 (setq content (cons (buffer-substring between-tags-start
2636 (1- (point)))
2637 content))
2638 ;; Gross, disgusting hack to deal with old interface
2639 ;; to display engine. Remove as soon as possible.
2640 (or (memq (w3-element-name w3-p-d-current-element)
2641 '(plaintext style xmp textarea))
2642 (w3-add-display-item 'text (car content)))
2643 (setq content (cons "\n" content)))
2644 (t
2645 (setq content (cons (buffer-substring between-tags-start
2646 (point))
2647 content))))
2648 (w3-set-element-content w3-p-d-current-element content))))))
2649
2650 (setq between-tags-end nil)))
2651
2652 ;; If the previous expression modified (point), then it went to
2653 ;; the value of between-tags-end.
2654
2655 ;; If we found a start or end-tag, we need to handle it.
2656 (cond
2657 (w3-p-d-tag-name
2658
2659 ;; Move past the tag and prepare for next between-tags region.
2660 (goto-char tag-end)
2661 (setq between-tags-start (point))
2662
2663 (cond
2664 (w3-p-d-end-tag-p
2665 ;; Handle an end-tag.
2666 (if (eq w3-p-d-tag-name (w3-element-name w3-p-d-current-element))
2667 (w3-close-element)
2668 ;; Handle the complex version. We have to search up (down?)
2669 ;; the open element stack to find the element that matches (if
2670 ;; any). Then we close all of the elements. On a conforming
2671 ;; SGML document this can do no wrong and it's not
2672 ;; unreasonable on a non-conforming document.
2673
2674 ;; Can't safely modify stack until we know the element we want
2675 ;; to find is in there, so work with a copy.
2676 (setq open-list w3-p-d-open-element-stack)
2677 (while (and open-list
2678 (not (eq w3-p-d-tag-name
2679 (w3-element-name (car open-list)))))
2680 (setq open-list (cdr open-list)))
2681 (cond (open-list
2682 ;; We found a match. Pop elements.
2683 ;; We will use the following value as a sentinel.
2684 (setq open-list (cdr open-list))
2685 (while (not (eq open-list w3-p-d-open-element-stack))
2686 (w3-close-element t))
2687 (w3-close-element))
2688 (t
2689 ;; Bogus end tag.
2690 (w3-debug-html
2691 (format "Unmatched end-tag </%s>"
2692 (w3-sgml-name-to-string w3-p-d-tag-name)))))))
2693 (t
2694 ;; Handle a start-tag.
2695 (cond
2696 ;; Check if the new element is allowed in the current element's
2697 ;; content model.
2698 ((w3-grok-tag-or-data w3-p-d-tag-name)
2699 (w3-open-element w3-p-d-tag-name tag-attributes)
2700
2701 ;; Handle NET-enabling start tags.
2702 (cond ((and net-tag-p
2703 (not w3-p-d-null-end-tag-enabled))
2704 ;; Save old values.
2705 (w3-set-element-undo-list
2706 w3-p-d-current-element
2707 (cons (cons 'w3-p-d-non-markup-chars
2708 w3-p-d-non-markup-chars)
2709 (cons '(w3-p-d-null-end-tag-enabled . nil)
2710 (w3-element-undo-list w3-p-d-current-element))))
2711 ;; Alter syntax.
2712 (setq w3-p-d-null-end-tag-enabled t)
2713 (w3-update-non-markup-chars)))
2714
2715 (setq content-model
2716 (w3-element-content-model w3-p-d-current-element))
2717
2718 ;; If the element does not have parsed contents, then we
2719 ;; can find its contents immediately.
2720 (cond
2721 ((memq content-model '(EMPTY CDATA XCDATA XXCDATA RCDATA))
2722 (cond
2723 ((eq 'EMPTY content-model)
2724 (w3-close-element))
2725 ((eq 'CDATA content-model)
2726 ;; CDATA: all data characters until an end-tag. We'll
2727 ;; process the end-tag on the next loop.
2728 (if (re-search-forward (if w3-p-d-null-end-tag-enabled
2729 "</[a-z>]\\|/"
2730 "</[a-z>]")
2731 nil 'move)
2732 (goto-char (match-beginning 0))))
2733 ((eq 'XCDATA content-model)
2734 ;; XCDATA: special non-SGML-standard mode which includes
2735 ;; all data characters until "</foo" is seen where "foo"
2736 ;; is the name of this element (for XMP and LISTING).
2737 (if (search-forward
2738 (concat "</" (symbol-name
2739 (w3-element-name w3-p-d-current-element)))
2740 nil 'move)
2741 (goto-char (match-beginning 0))))
2742 ((eq 'XXCDATA content-model)
2743 ;; XXCDATA: special non-SGML-standard mode which includes
2744 ;; all data until end-of-entity (end-of-buffer for us)
2745 ;; (for PLAINTEXT).
2746 (goto-char (point-max)))
2747 ((eq 'RCDATA content-model)
2748 ;; RCDATA: all data characters until end-tag is seen,
2749 ;; except that entities are expanded first, although the
2750 ;; expansions are _not_ scanned for end-tags, although the
2751 ;; expansions _are_ scanned for further entity
2752 ;; references.
2753 (while (progn
2754 (if (re-search-forward (if w3-p-d-null-end-tag-enabled
2755 "</[a-z>]\\|[/&]"
2756 "</[a-z>]\\|&")
2757 nil 'move)
2758 (goto-char (match-beginning 0)))
2759 (= ?& (following-char)))
2760 (w3-expand-entity-at-point-maybe)))))))
2761 (t
2762 ;; The element is illegal here. We'll just discard the start
2763 ;; tag as though we never saw it.
2764 ))))
2765
2766 (setq w3-p-d-tag-name nil)
2767 (setq w3-p-d-end-tag-p nil)
2768 (setq net-tag-p nil)
2769 (setq tag-attributes nil)
2770 (setq tag-end nil)))
2771
2772 ;; Hand items to the display engine.
2773 (cond ((not nodraw)
2774 (set-buffer w3-draw-buffer)
2775 (while (not (eq parse-tag-stream-last-displayed-item
2776 w3-p-d-parse-tag-stream-tail-pointer))
2777 (setq parse-tag-stream-last-displayed-item
2778 (cdr parse-tag-stream-last-displayed-item))
2779 ;; We call w3-handle-single-tag from only one spot so that it
2780 ;; is reasonable to inline it, since it is a big function.
2781 (w3-handle-single-tag
2782 (car (car parse-tag-stream-last-displayed-item))
2783 (cdr (car parse-tag-stream-last-displayed-item))))
2784 (set-buffer parse-buffer)))
2785
2786 ;; End of main while loop.
2787 )
2788
2789 ;; We have finished parsing the buffer!
2790 (if status-message-format
2791 (message "%sdone" (format status-message-format 100)))
2792 ;; Do this now so the user can see the full results before Emacs
2793 ;; goes off and garbage-collects for an hour. :-(
2794 (if w3-do-incremental-display
2795 (w3-pause))
2796
2797 ;; *** For debugging, save the true parse tree.
2798 ;; *** Make this look inside *DOCUMENT.
2799 (setq w3-last-parse-tree
2800 (w3-element-content w3-p-d-current-element))
2801
2802 ;; Return the parse in the format expected, a stream of tags
2803 ;; possibly with a buffer at the front.
2804 (if nodraw
2805 ;; Discard the *dummy item at start of list.
2806 (cdr parse-tag-stream)
2807 (cons w3-draw-buffer (cdr parse-tag-stream)))
2808
2809 )))
2810
2811
2812 ;;;
2813 ;;; Initialization of display engine to accept parser output.
2814 ;;;
2815
2816 (defun w3-prepare-draw-buffer-for-parse-buffer ()
2817 (setq list-buffers-directory nil)
2818 (let ((buf (get-buffer-create (url-generate-new-buffer-name
2819 "Untitled")))
2820 (info (mapcar (function (lambda (x) (cons x (symbol-value x))))
2821 w3-persistent-variables)))
2822 (setq w3-draw-buffer buf)
2823 (save-excursion
2824 (set-window-buffer (selected-window) buf)
2825 (set-buffer buf)
2826 (setq w3-draw-buffer (current-buffer))
2827 (erase-buffer)
2828 (buffer-disable-undo (current-buffer))
2829 (mapcar (function (lambda (x) (set (car x) (cdr x)))) info)
2830 (setq w3-last-fill-pos (point))
2831 (setq fill-column (min (- (or w3-strict-width (window-width))
2832 w3-right-border)
2833 (or w3-maximum-line-length (window-width))))
2834 (setq fill-prefix "")
2835 (w3-init-state))))
2836
2837
2838
2839 (provide 'w3-parse)
2840
2841 ;; Local variables:
2842 ;; indent-tabs-mode: nil
2843 ;; end: