Mercurial > hg > xemacs-beta
diff lisp/w3/w3-parse.el @ 0:376386a54a3c r19-14
Import from CVS: tag r19-14
author | cvs |
---|---|
date | Mon, 13 Aug 2007 08:45:50 +0200 |
parents | |
children | ac2d302a0011 |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/lisp/w3/w3-parse.el Mon Aug 13 08:45:50 2007 +0200 @@ -0,0 +1,2843 @@ +;; Created by: Joe Wells, jbw@csb.bu.edu +;; Created on: Sat Sep 30 17:25:40 1995 +;; Filename: w3-parse.el +;; Purpose: Parse HTML and/or SGML for Emacs W3 browser. + +;; Copyright © 1995 Joseph Brian Wells +;; Copyright © 1993, 1994, 1995 by William M. Perry (wmperry@spry.com) +;; +;; This program is free software; you can redistribute it and/or modify +;; it under the terms of the GNU General Public License as published by +;; the Free Software Foundation; either version 2 of the License, or +;; (at your option) any later version. +;; +;; This program is distributed in the hope that it will be useful, +;; but WITHOUT ANY WARRANTY; without even the implied warranty of +;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;; GNU General Public License for more details. +;; +;; You should have received a copy of the GNU General Public License +;; along with this program; if not, write to the Free Software +;; Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. +;; +;; On November 13, 1995, the license was available at +;; <URL:ftp://prep.ai.mit.edu/pub/gnu/COPYING-2.0>. It may still be +;; obtainable via that URL. + + +;;; +;;; Trying to make the best of an evil speed hack. +;;; + +;; Explanation: + +;; Basically, this file provides one big function (w3-parse-buffer) and +;; some data structures. However, to avoid code redundancy, I have broken +;; out some common subexpressions of w3-parse-buffer into separate +;; functions. I have declared these separate functions with "defsubst" so +;; they will be inlined into w3-parse-buffer. Also, I have defined them +;; within eval-when-compile forms, so no definitions will be emitted into +;; the .elc file for these separate functions. (They will work normally +;; when the uncompiled file is loaded.) + +;; Each of these subfunctions use some scratch variables in a purely local +;; fashion. In good software design, I would declare these variables as +;; close to their use as possible with "let". However, "let"-binding +;; variables is *SLOW* in Emacs Lisp, even when compiled. Since each of +;; these functions is executed one or more time during each iteration of +;; the main loop, I deemed this too expensive. So the main function does +;; the "let"-binding of these variables. However, I still want to declare +;; them close to their use, partially to keep the compiler from crying +;; "Wolf!" when there is no danger (well, maybe a little danger :-), so I +;; define some macros for this purpose. + +;; Also, there are some variables which are updated throughout the file +;; (remember this is really all one function). Some of the code which +;; updates them is located inside the subfunctions. So that the compiler +;; will not complain, these variables are defined with defvar. + +(require 'w3-vars) + +(eval-when-compile + (defconst w3-p-s-var-list nil + "A list of the scratch variables used by functions called by +w3-parse-buffer which it is w3-parse-buffer's responsibility to +\"let\"-bind.") + + ;; *** This is unused and does not belong right here anyway. + (defmacro w3-resolve-numeric-entity (ent) + "Return a string representing the numeric entity ENT (&#ENT;)" + (` (if (< (, ent) 256) + (char-to-string (, ent)) + (format "[Too large character: %s]" (, ent))))) + + (defmacro w3-p-s-var-def (var) + "Declare VAR as a scratch variable which w3-parse-buffer must +\"let\"-bind." + (` (eval-when-compile + (defvar (, var)) + (or (memq '(, var) w3-p-s-var-list) + (setq w3-p-s-var-list (cons '(, var) w3-p-s-var-list)))))) + + (defmacro w3-p-s-let-bindings (&rest body) + "\"let\"-bind all of the variables in w3-p-s-var-list in BODY." + (` (let (, w3-p-s-var-list) + (,@ body)))) + (put 'w3-p-s-let-bindings 'lisp-indent-function 0) + (put 'w3-p-s-let-bindings 'edebug-form-spec t) + + (defvar w3-p-d-current-element) + (put 'w3-p-d-current-element 'variable-documentation + "Information structure for the current open element.") + + (defvar w3-p-d-exceptions) + (put 'w3-p-d-exceptions 'variable-documentation + "Alist specifying elements (dis)allowed because of an (ex|in)clusion +exception of some containing element (not necessarily the immediately +containing element). Each item specifies a transition for an element +which overrides that specified by the current element's content model. +Each item is of the form (TAG ACTION *same ERRORP).") + + (defvar w3-p-d-in-parsed-marked-section) + (put 'w3-p-d-in-parsed-marked-section 'variable-documentation + "Are we in a parsed marked section so that we have to scan for \"]]>\"?") + + (defvar w3-p-d-non-markup-chars) + (put 'w3-p-d-non-markup-chars 'variable-documentation + "The characters that do not indicate the start of markup, in the format +for an argument to skip-chars-forward.") + + (defvar w3-p-d-null-end-tag-enabled) + (put 'w3-p-d-null-end-tag-enabled 'variable-documentation + "Is the null end tag (\"/\") enabled?") + + (defvar w3-p-d-open-element-stack) + (put 'w3-p-d-open-element-stack 'variable-documentation + "A stack of the currently open elements, with the innermost enclosing +element on top and the outermost on bottom.") + + (defvar w3-p-d-parse-tag-stream-tail-pointer) + (put 'w3-p-d-parse-tag-stream-tail-pointer 'variable-documentation + "Points to last cons cell in parse-tag stream. We add items to tail of +parse-tag-stream instead of head.") + + (defvar w3-p-d-shortrefs) + (put 'w3-p-d-shortrefs 'variable-documentation + "An alist of the magic entity reference strings in the current +between-tags region and their replacements. Each item is of the format +\(REGEXP . REPLACEMENT-STRING\). Although in SGML shortrefs normally name +entities whose value should be used as the replacement, we have +preexpanded the entities for speed. We have also regexp-quoted the +strings to be replaced, so they can be used with looking-at. This should +never be in an element's overrides field unless +w3-p-d-shortref-chars is also in the field.") + + (defvar w3-p-d-shortref-chars) + (put 'w3-p-d-shortref-chars 'variable-documentation + "A string of the characters which can start shortrefs in the current +between-tags region. This must be in a form which can be passed to +skip-chars-forward and must contain exactly the characters which start the +entries in w3-p-d-shortrefs. If this variable is mentioned in the +overrides field of an element, its handling is magical in that the +variable w3-p-d-non-markup-chars is saved to the element's undo-list and +updated at the same time. This should never be in an element's overrides +field unless w3-p-d-shortrefs is also in the field.") + + (defvar w3-p-d-tag-name) + (put 'w3-p-d-tag-name 'variable-documentation + "Name of tag we are looking at, as an Emacs Lisp symbol. +Only non-nil when we are looking at a tag.") + + (defvar w3-p-d-end-tag-p) + (put 'w3-p-d-end-tag-p 'variable-documentation + "Is the tag we are looking at an end tag? +Only non-nil when we are looking at a tag.") + + ) + + +;;; +;;; HTML syntax error messages. +;;; + +(eval-when-compile + + (defvar w3-p-d-debug-url) + (put 'w3-p-d-debug-url 'variable-documentation + "Whether to print the URL being parsed before an error messages. +Only true for the first error message.") + + ;; The level parameter indicates whether the error is (1) very + ;; serious, must be displayed to all users, (2) invalid HTML, but the + ;; user should only be told if the user has indicated interest, or (3) + ;; valid HTML which is bad because it appears to rely on the way certain + ;; browsers will display it, which should only be displayed to the user + ;; if they have really asked for it. + + (defmacro w3-debug-html (&rest body) + "Emit a warning message. +These keywords may be used at the beginning of the arguments: + :mandatory-if sexp -- force printing if sexp evaluates non-nil. + :bad-style -- do not print unless w3-debug-html is 'style. + :outer -- do not include the current element in the element + context we report. + :nocontext -- do not include context where error detected. +The remaining parameters are treated as the body of a progn, the value of +which must be a string to use as the error message." + (let (mandatory-if bad-style outer nocontext condition) + (while (memq (car body) '(:mandatory-if :bad-style :outer :nocontext)) + (cond ((eq ':mandatory-if (car body)) + (setq mandatory-if (car (cdr body))) + (setq body (cdr (cdr body)))) + ((eq ':bad-style (car body)) + (setq bad-style t) + (setq body (cdr body))) + ((eq ':nocontext (car body)) + (setq nocontext t) + (setq body (cdr body))) + ((eq ':outer (car body)) + (setq outer t) + (setq body (cdr body))))) + (setq condition (if bad-style + '(eq 'style w3-debug-html) + 'w3-debug-html)) + (if mandatory-if + (setq condition + (` (or (, mandatory-if) + (, condition))))) + (` (if (, condition) + (let ((message (progn (,@ body)))) + (if message + (w3-debug-html-aux message + (,@ (if nocontext + (list outer nocontext) + (if outer '(t))))))))))) + + ;; This is unsatisfactory. + (put 'w3-debug-html 'lisp-indent-function 0) + + (put 'w3-debug-html 'edebug-form-spec + '([&rest &or ":nocontext" ":outer" [":mandatory-if" form] ":bad-style"] + &rest form)) + ) + +(defun w3-debug-html-aux (message &optional outer nocontext) + (let ( + ;; We have already determined whether the user should see the + ;; message, so don't let w3-warn suppress it. + (w3-debug-html t)) + ;; Print the URL before the first error message for a document. + (cond (w3-p-d-debug-url + (let ((url (url-view-url t))) + (w3-warn 'html + (if (or (null url) + (string-equal "" url)) + (format "HTML errors for buffer %s" + (current-buffer)) + (format "HTML errors for <URL:%s>" url)))) + (setq w3-p-d-debug-url nil))) + (w3-warn 'html + (if nocontext + message + (concat message + ;; Display context information for each error + ;; message. + "\n Containing elements: " + (w3-open-elements-string (if outer 1)) + (concat + "\n Text around error: " + (save-restriction + (widen) + (progn + (insert "*ERROR*") + (prog1 + (w3-quote-for-string + (buffer-substring + (max (- (point) 27) (point-min)) + (min (+ (point) 20) (point-max)))) + (delete-char -7)))))))))) + +(defun w3-quote-for-string (string) + (save-excursion + (set-buffer (get-buffer-create " w3-quote-whitespace")) + (erase-buffer) + (insert string) + (goto-char (point-min)) + (insert "\"") + (while (progn + (skip-chars-forward "^\"\\\t\n\r") + (not (eobp))) + (insert "\\" (cdr (assq (following-char) '((?\" . "\"") + (?\\ . "\\") + (?\t . "t") + (?\n . "n") + (?\r . "r"))))) + (delete-char 1)) + (insert "\"") + (buffer-string))) + + +;;; +;;; General entity references and numeric character references. +;;; + +;; *** MULE conversion? +;; *** I18N HTML support? + +(let ((html-entities w3-html-entities)) + (while html-entities + (put (car (car html-entities)) 'html-entity-expansion + (cons 'CDATA (if (integerp (cdr (car html-entities))) + (char-to-string (cdr (car html-entities))) + (cdr (car html-entities))))) + (setq html-entities (cdr html-entities)))) + +;; These are handled differently than the normal HTML entities because +;; we need to define the entities with 'nil instead of 'CDATA so +;; that they are correctly scanned for new markup. +;; +;; from jbw@cs.bu.edu +;; +;;> Of course, this differs from the specification a bit. The W3C tech +;;> report defines all of these as SYSTEM entities. This potentially means +;;> that they can be used in more contexts. The method I outlined above +;;> means "&smiley;" can only be used in contexts where IMG is a valid +;;> element. I am not sure exactly where it is okay to use a SYSTEM entity. +;;> I think anywhere that data characters are accepted. +;; +;; I find this acceptable, as just what the hell are you supposed to do with +;; &computer; as part of a value of a form input when you display it and/or +;; submit it?! + +(let ((html-entities w3-graphic-entities) + (cur nil)) + (while html-entities + (setq cur (car html-entities) + html-entities (cdr html-entities)) + (put (nth 0 cur) 'html-entity-expansion + (cons 'nil (format "img src=\"%s/%s%s\" alt=\"%s\"" + w3-icon-directory + (nth 1 cur) + (if w3-icon-format + (concat "." (symbol-name w3-icon-format)) + "") + (or (nth 3 cur) (nth 2 cur))))))) + +;; These are the general entities in HTML 3.0 in terms of which the math +;; shortrefs are defined: +;; +;; <!ENTITY REF1 STARTTAG "SUP"> +;; <!ENTITY REF2 ENDTAG "SUP"> +;; <!ENTITY REF3 STARTTAG "SUB"> +;; <!ENTITY REF4 ENDTAG "SUB"> +;; <!ENTITY REF5 STARTTAG "BOX"> +;; <!ENTITY REF6 ENDTAG "BOX"> +;; +;; We're ignoring them because these names should really be local to the +;; DTD and not visible in the document. They might change at any time in +;; future HTML standards. + +;; <!--Entities for language-dependent presentation (BIDI and contextual analysis) --> +;; <!ENTITY zwnj CDATA "‌"-- zero width non-joiner--> +;; <!ENTITY zwj CDATA "‍"-- zero width joiner--> +;; <!ENTITY lrm CDATA "‎"-- left-to-right mark--> +;; <!ENTITY rlm CDATA "‏"-- right-to-left mark--> + +;; Entity names are case sensitive! + +;; & should only be recognized when followed by letter or # and +;; digit or # and letter. + +(eval-when-compile + + (w3-p-s-var-def w3-p-s-entity) + (w3-p-s-var-def w3-p-s-pos) + (w3-p-s-var-def w3-p-s-num) + ;; Destroys free variables: + ;; w3-p-s-entity, w3-p-s-pos, w3-p-s-num + ;; Depends on case-fold-search being t. + (defsubst w3-expand-entity-at-point-maybe () + ;; We are looking at a &. + ;; Only &A or  or &#A syntax is special. + (cond + ((and (looking-at "&\\([a-z][-a-z0-9.]*\\)[\ ;\n]?") ; \n should be \r + ;; We are looking at a general entity reference, maybe undefined. + (setq w3-p-s-entity + (get + (intern (buffer-substring (match-beginning 1) (match-end 1))) + 'html-entity-expansion))) + + ;; If the reference was undefined, then for SGML, we should really + ;; issue a warning and delete the reference. However, the HTML + ;; standard (contradicting the SGML standard) says to leave the + ;; undefined reference in the text. + + ;; We are looking at a defined general entity reference. + (replace-match "") + (cond ((eq 'CDATA (car w3-p-s-entity)) + ;; Leave point after expansion so we don't rescan it. + (insert (cdr w3-p-s-entity))) + ((memq (car w3-p-s-entity) '(nil STARTTAG ENDTAG MS MD)) + ;; nil is how I mark ordinary entities. + ;; The replacement text gets rescanned for all of these. + (setq w3-p-s-pos (point)) + (insert (cdr (assq (car w3-p-s-entity) + '((nil . "") + (STARTTAG . "<") + (ENDTAG . "</") + (MS . "<![") + (MD . "<!")))) + (cdr w3-p-s-entity) + (cdr (assq (car w3-p-s-entity) + '((nil . "") + (STARTTAG . ">") + (ENDTAG . ">") + (MS . "]]>") + (MD . ">"))))) + (goto-char w3-p-s-pos) + ;; *** Strictly speaking, if we parse anything from the + ;; replacement text, it must end before the end of the + ;; replacement text. + ) + ((eq 'SDATA (car w3-p-s-entity)) + (insert "[Unimplemented SDATA \"%s\"]" (cdr w3-p-s-entity))) + ((eq 'PI (car w3-p-s-entity)) + ;; We are currently ignoring processing instructions. + ;; *** Strictly speaking, we should issue a warning if this + ;; occurs in a attribute value. + ) + (t + ;; *** We don't handle external entities yet. + (error "[Unimplemented entity: \"%s\"]" w3-p-s-entity)))) + + ((looking-at "&#[0-9][0-9]*\\([\ ;\n]?\\)") ; \n should be \r + ;; We are looking at a numeric character reference. + ;; Ensure the number is already terminated by a semicolon or carriage + ;; return so we can use "read" to get it as a number quickly. + (cond ((= (match-beginning 1) (match-end 1)) + ;; This is very uncommon, so we don't have to be quick here but + ;; rather correct. + (save-excursion + (goto-char (match-end 0)) ; same as match-end 1 + (insert ?\;)) + ;; Set up the match data properly + (looking-at "&#[0-9][0-9]*;"))) + (forward-char 2) + (setq w3-p-s-num (read (current-buffer))) + ;; Always leave point after the expansion of a numeric + ;; character reference, like it were a CDATA entity. + (replace-match "") + ;; char-to-string will hopefully do something useful with characters + ;; larger than 255. I think in MULE it does. Is this true? + ;; Bill wants to call w3-resolve-numeric-entity here, but I think + ;; that functionality belongs in char-to-string. + ;; The largest valid character in the I18N version of HTML is 65533. + ;; <URL:ftp://ds.internic.net/internet-drafts/draft-ietf-html-i18n-01.txt> + (insert (char-to-string w3-p-s-num))) + + ((looking-at "&#\\(re\\|rs\\|space\\|tab\\)[\ ;\n]?") ; \n should be \r + (replace-match (assq (upcase (char-after (+ 3 (point)))) + '(;; *** Strictly speaking, record end should be + ;; carriage return. + (?E . "\n") ; RE + ;; *** And record start should be line feed. + (?S . "") ; RS + (?P . " ") ; SPACE + (?A . "\t")))) ; TAB + ;; Leave point after the expansion of a character reference, so it + ;; doesn't get rescanned. + ;; *** Strictly speaking, we should issue a warning for &#foo; if foo + ;; is not a function character in the SGML declaration. + ) + + ((eq ?& (following-char)) + ;; We are either looking at an undefined reference or a & that does + ;; not start a reference (in which case we should not have been called). + ;; Skip over the &. + (forward-char 1)) + + (t + ;; What is the code doing calling us if we're not looking at a "&"? + (error "this should never happen")))) + + ) + + +;;; +;;; Syntax table used in markup declarations. +;;; + +(defvar w3-sgml-md-syntax-table + (let ((table (make-syntax-table)) + (items '( + (0 "." 255) ; clear everything + (?\r " ") + (?\t " ") + (?\n " ") + (32 " ") ; space + (?< "\(>") + (?> "\)<") + (?\( "\(\)") + (?\) "\)\(") + (?\[ "\(\]") + (?\] "\)\[") + (?\" "\"") + (?\' "\"") + (?a "w" ?z) + (?A "w" ?Z) + (?0 "w" ?9) + (?. "w") + ;; "-" can be a character in a NAME, but it is also used in + ;; "--" as both a comment start and end within SGML + ;; declarations ("<!" ... ">"). In HTML, it is only used + ;; as a NAME character in the parameter entities + ;; Content-Type, HTTP-Method, and style-notations and in + ;; the attribute name http-equiv and in the notation names + ;; dsssl-lite and w3c-style. We would like to be able to + ;; train Emacs to skip over these kinds of comments with + ;; forward-sexp and backward-sexp. Is there any way to + ;; teach Emacs how to do this? It doesn't seem to be the + ;; case. + (?- "w") + ))) + (while items + (let* ((item (car items)) + (char (car item)) + (syntax (car (cdr item))) + (bound (or (car-safe (cdr-safe (cdr item))) + char))) + (while (<= char bound) + (modify-syntax-entry char syntax table) + (setq char (1+ char)))) + (setq items (cdr items))) + table) + "A syntax table for parsing SGML markup declarations.") + + +;;; +;;; Element information data type. +;;; + +;; The element information data type is used in two ways: +;; +;; * To store the DTD, there is one element record for each element in +;; the DTD. +;; +;; * To store information for open elements in the current parse tree. +;; Each such element is initialized by copying the element record +;; from the DTD. This means that values in the fields can not be +;; destructively altered, although of course the fields can be +;; changed. + +;; The cells in this vector are: +;; +;; name: the element's name (a generic identifier). +;; +;; end-tag-name: a symbol whose name should be the result of prefixing +;; the generic-identifier with a slash. This is a convenience value for +;; interfacing with the display engine which expects a stream of start +;; and end tags in this format rather than a tree. +;; +;; content-model: a data structure describing what elements or character +;; data we expect to find within this element. This is either a symbol +;; listed here: +;; +;; EMPTY: no content, no end-tag allowed. +;; CDATA: all data characters until "</[a-z]" is seen. +;; XCDATA: special non-SGML-standard mode which includes all data +;; characters until "</foo" is seen where "foo" is the name of this +;; element. (for XMP and LISTING) +;; XXCDATA: special non-SGML-standard mode which includes all data +;; until end-of-entity (end-of-buffer for us). (for PLAINTEXT) +;; RCDATA: all data characters until "</[a-z]" is seen, except that +;; entities are expanded first, although the expansions are not +;; scanned for end-tags. +;; XINHERIT: special non-SGML-standard mode which means to use the +;; content model of the containing element instead. +;; +;; or a vector of this structure: +;; +;; [(INCLUDES INCSPACEP (((TAG ...) . TRANSITION) ...) DEFAULT) ...] +;; +;; where INCLUDES is of the format: +;; +;; (TAG ...) +;; +;; where each TRANSITION is one of these: +;; +;; (ACTION NEW-STATE ERRORP) +;; (ACTION NEW-STATE) +;; (ACTION) +;; +;; where DEFAULT is one of these: +;; +;; nil or TRANSITION +;; +;; where the meaning of the components is: +;; +;; INCLUDES is a list of tags for which the transition (*include *same +;; nil) applies. +;; +;; DEFAULT if non-nil is a transition that should be taken when +;; matching any possibility not explicitly listed in another +;; TRANSITION, except for data characters containing only whitespace. +;; +;; INCSPACEP specifies how to handle data characters which include +;; only whitespace characters. The value is non-nil to indicate +;; (*include *same nil) or nil to indicate (*discard *same nil). +;; +;; TAG is a symbol corresponding to the start-tag we are looking at, +;; or *data when seeing character data that includes at least one +;; non-space character. +;; +;; ACTION is one of: +;; *close: Close this element and try again using content model of +;; enclosing element. (Note that this does not apply to the +;; case of an element being closed by its own end-tag.) +;; *include: Process new element as subelement of this one or +;; include data characters directly. +;; *discard: Discard a start-tag or data characters. +;; *retry: Try again after processing NEW-STATE and ERRORP. +;; ELEMENT: Open ELEMENT (with default attributes), then try again +;; using its content model. +;; +;; NEW-STATE (optional, default *same) is the index of the state to +;; move to after processing the element or one of these: +;; *same: no state change occurs. +;; *next: change the the current state + 1. +;; The initial state is 0. NEW-STATE does not matter if ACTION is +;; *close. +;; +;; ERRORP (optional, default nil) if non-nil indicates this transition +;; represents an error. The error message includes this value if it +;; is a string. +;; +;; If no matching transition is found, the default transition is +;; (*discard *same "not allowed here"). +;; +;; overrides: An alist of pairs of the form (VAR REPLACEP . VALUE). +;; When this element is opened, the old value of VAR is saved in the +;; undo-list. If REPLACEP is non-nil, then VAR gets value VALUE, +;; otherwise VAR gets value (append VALUE (symbol-value VAR)). Useful +;; values for VAR are: +;; +;; w3-p-d-exceptions: See doc string. +;; +;; w3-p-d-shortrefs: See doc string. +;; +;; w3-p-d-shortref-chars: See doc string. +;; +;; end-tag-omissible: Whether it is legal to omit the end-tag of this +;; element. If an end-tag is inferred for an element whose end tag is +;; not omissible, an error message is given. +;; +;; state: The current state in the content model. Preset to the initial +;; state of 0. +;; +;; undo-list: an alist of of former values of local variables +;; of w3-parse-buffer to restore upon closing this element. Each +;; item on the list is of the format (VAR . VALUE-TO-RESTORE). +;; +;; attributes: an alist of attributes and values. Each item on +;; this list is of the format (ATTRIBUTE-NAME . VALUE). Each +;; ATTRIBUTE-NAME is a symbol and each attribute value is a +;; string. +;; +;; content: a list of the accumulated content of the element. While the +;; element is open, the list is in order from latest to earliest, +;; otherwise it is in order from earliest to latest. Each member is +;; either a string of data characters or a list of the form (NAME +;; ATTRIBUTES CONTENT), where NAME is the subelement's name, ATTRIBUTES +;; is an alist of the subelement's attribute names (lowercase symbols) +;; and their values (strings), and CONTENT is the subelement's content. + +(eval-when-compile + + (defconst w3-element-fields + '(name end-tag-name content-model state overrides undo-list + content attributes end-tag-omissible deprecated)) + + (let* ((fields w3-element-fields) + (index (1- (length fields)))) + (while fields + (let* ((field (symbol-name (car fields))) + (get-sym (intern (concat "w3-element-" field))) + (set-sym (intern (concat "w3-set-element-" field)))) + (eval (` (progn + (defmacro (, get-sym) (element) + (list 'aref element (, index))) + (defmacro (, set-sym) (element value) + (list 'aset element (, index) value)))))) + (setq fields (cdr fields)) + (setq index (1- index)))) + + (defmacro w3-make-element () + (list 'make-vector (length w3-element-fields) nil)) + + ;; *** move this to be with DTD declaration. + (defmacro w3-fresh-element-for-tag (tag) + (` (copy-sequence + (or (get (, tag) 'html-element-info) + (error "unimplemented element %s" + (w3-sgml-name-to-string (, tag))))))) + + ;; *** move this to be with DTD declaration. + (defmacro w3-known-element-p (tag) + (` (get (, tag) 'html-element-info))) + + (defsubst w3-sgml-name-to-string (sym) + (upcase (symbol-name sym))) + + ) + + +;;; +;;; Parse tree manipulation. +;;; + +;; ;; Find the name of the previous element or a substring of the +;; ;; preceding data characters. +;; (let ((content (w3-element-content (car stack)))) +;; (while content +;; (cond +;; ((and (stringp (car content)) +;; (not (string-match "\\`[ \t\n\r]*\\'" (car content)))) +;; (setq prior-item (car content)) +;; ;; Trim trailing whitespace +;; (if (string-match "\\(.*[^ \t\n\r]\\)[ \t\n\r]*\\'" prior-item) +;; (setq prior-item (substring prior-item 0 (match-end 1)))) +;; (if (> (length prior-item) 8) +;; (setq prior-item (concat "..." (substring prior-item -8)))) +;; (setq prior-item (w3-quote-for-string prior-item)) +;; (setq prior-item (concat "\(after " prior-item "\)")) +;; (setq content nil)) +;; ((and (consp (car content)) +;; (symbolp (car (car content)))) +;; (setq prior-item +;; (concat "\(after " +;; (w3-sgml-name-to-string (car (car content))) +;; "\)")) +;; (setq content nil)) +;; (t +;; (setq content (cdr content)))))) + +;; Only used for HTML debugging. +(defun w3-open-elements-string (&optional skip-count) + (let* ((stack (nthcdr (or skip-count 0) + (cons w3-p-d-current-element + w3-p-d-open-element-stack))) + ;;(prior-item "(at start)") + result) + ;; Accumulate the names of the enclosing elements. + (while stack + (let ((element (w3-element-name (car stack)))) + (if (eq '*holder element) + nil + ;; Only include *DOCUMENT if there are no other elements. + (if (or (not (eq '*document element)) + (null result)) + (setq result (cons (w3-sgml-name-to-string element) + result))))) + (setq stack (cdr stack))) + (setq result (mapconcat 'identity result ":")) + (if result + ;;(concat + result + ;; prior-item) + "[nowhere!]"))) + +;; *** This doesn't really belong here, but where? +(eval-when-compile + (defmacro w3-invalid-sgml-chars () + "Characters not allowed in an SGML document using the reference +concrete syntax (i.e. HTML). Returns a string in the format expected by +skip-chars-forward." + "\000-\010\013\014\016-\037\177-\237")) + +(eval-when-compile + ;; Uses: + ;; w3-p-d-null-end-tag-enabled, w3-p-d-in-parsed-marked-section, + ;; w3-p-d-shortref-chars + ;; Modifies free variable: + ;; w3-p-d-non-markup-chars + (defsubst w3-update-non-markup-chars () + (setq w3-p-d-non-markup-chars + (concat "^&<" + (w3-invalid-sgml-chars) + (if w3-p-d-null-end-tag-enabled "/" "") + (if w3-p-d-in-parsed-marked-section "]" "") + (or w3-p-d-shortref-chars "")))) + + ;; Modifies free variable: + ;; w3-p-d-parse-tag-stream-tail-pointer + (defsubst w3-add-display-item (tag value) + (setcdr w3-p-d-parse-tag-stream-tail-pointer + (list (cons tag value))) + (setq w3-p-d-parse-tag-stream-tail-pointer + (cdr w3-p-d-parse-tag-stream-tail-pointer))) + +) + +(eval-when-compile + (w3-p-s-var-def w3-p-s-overrides) + (w3-p-s-var-def w3-p-s-undo-list) + (w3-p-s-var-def w3-p-s-var) + ;; Uses free variables: + ;; w3-p-d-non-markup-chars + ;; Modifies free variables: + ;; w3-p-d-current-element, w3-p-d-open-element-stack + ;; Destroys free variables: + ;; w3-p-s-overrides, w3-p-s-undo-list, w3-p-s-var + (defsubst w3-open-element (tag attributes) + + ;; Send trailing data character item in the old current element to + ;; display engine. + (if (stringp (car-safe (w3-element-content w3-p-d-current-element))) + (w3-add-display-item + 'text + (car-safe (w3-element-content w3-p-d-current-element)))) + + ;; Push new element on stack. + (setq w3-p-d-open-element-stack (cons w3-p-d-current-element + w3-p-d-open-element-stack)) + (setq w3-p-d-current-element (w3-fresh-element-for-tag tag)) + + ;; Warn if deprecated or obsolete. + (if (w3-element-deprecated w3-p-d-current-element) + (w3-debug-html :outer + (format "%s element %s." + (if (eq 'obsolete + (w3-element-deprecated w3-p-d-current-element)) + "Obsolete" + "Deprecated") + (w3-sgml-name-to-string + (w3-element-name w3-p-d-current-element))))) + + ;; Store attributes. + ;; *** we are not handling #CURRENT attributes (HTML has none). + (w3-set-element-attributes w3-p-d-current-element attributes) + ;; *** Handle default attribute values. + ;; *** Fix the attribute name for unnamed values. Right now they will + ;; be in the attribute list as items of the format (VALUE . VALUE) where + ;; both occurrences of VALUE are the same. The first one needs to be + ;; changed to the proper attribute name by consulting the DTD. + ;; ******************** + + ;; Handle syntax/semantics overrides of new current element. + (cond ((w3-element-overrides w3-p-d-current-element) + (setq w3-p-s-overrides + (w3-element-overrides w3-p-d-current-element)) + (setq w3-p-s-undo-list nil) + (while w3-p-s-overrides + (setq w3-p-s-var (car (car w3-p-s-overrides))) + (setq w3-p-s-undo-list + (cons (cons w3-p-s-var + (symbol-value w3-p-s-var)) + w3-p-s-undo-list)) + (set w3-p-s-var (if (car (cdr (car w3-p-s-overrides))) + (cdr (cdr (car w3-p-s-overrides))) + (append (cdr (cdr (car w3-p-s-overrides))) + (symbol-value w3-p-s-var)))) + ;; *** HACK HACK. + ;; Magic handling of w3-p-d-shortref-chars. + (cond ((eq 'w3-p-d-shortref-chars w3-p-s-var) + (setq w3-p-s-undo-list + (cons (cons 'w3-p-d-non-markup-chars + w3-p-d-non-markup-chars) + w3-p-s-undo-list)) + (w3-update-non-markup-chars))) + (setq w3-p-s-overrides (cdr w3-p-s-overrides))) + (w3-set-element-undo-list w3-p-d-current-element + w3-p-s-undo-list))) + + ;; Handle content-model inheritance. (Very non-SGML!) + (if (eq 'XINHERIT (w3-element-content-model w3-p-d-current-element)) + (w3-set-element-content-model + w3-p-d-current-element + (w3-element-content-model (car w3-p-d-open-element-stack)))) + + ;; Send the start-tag and attributes to the display engine. + (if (memq tag '(plaintext style xmp textarea)) + ;; Garbage special-casing for old display engine. + ;; Nothing is sent until end-tag is found. + ;; The DTD will ensure no subelements of these elements. + nil + ;; Normal procedure. + (w3-add-display-item tag attributes))) + ) + +;; The protocol for handing items to the display engine is as follows. +;; +;; For an element, send (START-TAG . ATTS), each member of the content, +;; and (END-TAG . nil) if the element is allowed to have an end tag. +;; +;; For data characters, send (text . DATA-CHARACTERS). +;; +;; Exceptions: +;; +;; For PLAINTEXT, STYLE, XMP, TEXTAREA send: +;; (START-TAG . ((data . DATA-CHARACTERS) . ATTS)). +;; +;; *** This requires somehow eliminating any subelements of the TEXTAREA +;; element. TEXTAREA can contain subelements in HTML 3.0. +;; +;; For LISTING, send (text . DATA-CHARACTERS). (Is this really correct or +;; is this perhaps a bug in the old parser?) I'm ignoring this for now. + +(eval-when-compile + (w3-p-s-var-def w3-p-s-undo-list) + (w3-p-s-var-def w3-p-s-content) + (w3-p-s-var-def w3-p-s-end-tag) + ;; Modifies free variables: + ;; w3-p-d-current-element, w3-p-d-open-element-stack + ;; Accesses free variables: + ;; w3-p-d-tag-name, w3-p-d-end-tag-p + ;; Destroys free variables: + ;; w3-p-s-undo-list, w3-p-s-content, w3-p-s-end-tag + (defsubst w3-close-element (&optional inferred) + ;; inferred: non-nil if the end-tag of the current element is being + ;; inferred due to the presence of content not allowed in the current + ;; element. If t, then the tag causing this is in w3-p-d-tag-name and + ;; w3-p-d-end-tag-p. + ;; (OLD: ... otherwise it is a symbol indicating the start-tag + ;; of an element or *data or *space indicating data characters.) + + (cond ((and inferred + (not (w3-element-end-tag-omissible w3-p-d-current-element))) + (w3-debug-html + (format "</%s> end-tag not omissible (required due to %s)" + (w3-sgml-name-to-string + (w3-element-name w3-p-d-current-element)) + (cond ((eq t inferred) + (format (if w3-p-d-end-tag-p + "</%s> end-tag" + "start-tag for %s") + (w3-sgml-name-to-string + w3-p-d-tag-name))) + ;; *** Delete this functionality? + ((memq inferred '(*space *data)) + "data characters") + ((symbolp inferred) + (format "start-tag for %s" + (w3-sgml-name-to-string inferred))) + ))))) + + ;; Undo any variable bindings of this element. + (cond ((w3-element-undo-list w3-p-d-current-element) + (setq w3-p-s-undo-list + (w3-element-undo-list w3-p-d-current-element)) + (while w3-p-s-undo-list + (set (car (car w3-p-s-undo-list)) + (cdr (car w3-p-s-undo-list))) + (setq w3-p-s-undo-list (cdr w3-p-s-undo-list))))) + + (setq w3-p-s-end-tag + (w3-element-end-tag-name w3-p-d-current-element)) + + ;; Fix up the content of the current element in preparation for putting + ;; it in the parent. + ;; Remove trailing newline from content, if there is one, otherwise send + ;; any trailing data character item to display engine. + (setq w3-p-s-content (w3-element-content w3-p-d-current-element)) + (cond ((null w3-p-s-content)) + ((equal "\n" (car w3-p-s-content)) + (setq w3-p-s-content (cdr w3-p-s-content))) + ((and (stringp (car w3-p-s-content)) + ;; Garbage special-casing for old display engine. + (not (memq w3-p-s-end-tag + '(/plaintext /style /xmp /textarea)))) + (w3-add-display-item 'text (car w3-p-s-content)))) + + ;; Send the end-tag to the display engine, but only if the element is + ;; allowed to have an end tag. + (cond ((memq w3-p-s-end-tag '(/plaintext /style /xmp /textarea)) + ;; Garbage special-casing for old display engine. + ;; Format old display engine expects for these elements: + ;; (START-TAG . ((data . DATA-CHARACTERS) . ATTRIBUTES)) + (w3-add-display-item + ;; Use the *start*-tag, not the end-tag. + (w3-element-name w3-p-d-current-element) + (cons (cons 'data + (condition-case nil + (mapconcat 'identity w3-p-s-content "") + (error "eeek! subelement content!"))) + (w3-element-attributes w3-p-d-current-element)))) + ;; *** Handle LISTING the way the old parser did. + ((eq 'EMPTY (w3-element-content-model w3-p-d-current-element)) + ;; Do nothing, can't have an end tag. + ) + (t + ;; Normal case. + (w3-add-display-item w3-p-s-end-tag nil) + (if (null w3-p-s-content) + (w3-debug-html + :bad-style :outer + ;; Don't warn for empty TD elements or empty A elements + ;; with no HREF attribute. + ;; *** Crude hack that should really be encoded in the + ;; element database somehow. + (if (or (not (memq (w3-element-name w3-p-d-current-element) + '(a td))) + (assq 'href + (w3-element-attributes w3-p-d-current-element))) + (format "Empty %s element." + (w3-sgml-name-to-string + (w3-element-name w3-p-d-current-element)))))))) + + ;; Put the current element in the proper place in its parent. + ;; This will cause an error if we overpop the stack. + (w3-set-element-content + (car w3-p-d-open-element-stack) + (cons (list (w3-element-name w3-p-d-current-element) + (w3-element-attributes w3-p-d-current-element) + (nreverse w3-p-s-content)) + (w3-element-content (car w3-p-d-open-element-stack)))) + + ;; Pop the stack. + (setq w3-p-d-current-element (car w3-p-d-open-element-stack)) + (setq w3-p-d-open-element-stack (cdr w3-p-d-open-element-stack))) + + ) + + +;;; +;;; A pseudo-DTD for HTML. +;;; + +(eval-when-compile + ;; This works around the following bogus compiler complaint: + ;; While compiling the end of the data in file w3-parse.el: + ;; ** the function w3-expand-parameters is not known to be defined. + ;; This is a bogus error. Anything of this form will trigger this message: + ;; (eval-when-compile (defun xyzzy () (xyzzy))) + (defun w3-expand-parameters (pars data) nil)) + +(eval-when-compile + (defun w3-expand-parameters (pars data) + (cond ((null data) + nil) + ((consp data) + ;; This has to be written carefully to avoid exceeding the + ;; maximum lisp function call nesting depth. + (let (result) + (while (consp data) + (let ((car-exp (w3-expand-parameters pars (car data)))) + (setq result + (if (and (symbolp (car data)) + (not (eq car-exp (car data))) + ;; An expansion occurred. + (listp car-exp)) + ;; The expansion was a list, which we splice in. + (condition-case err + (append (reverse car-exp) result) + (wrong-type-argument + (if (eq 'listp (nth 1 err)) + ;; Wasn't really a "list" since the last + ;; cdr wasn't nil, so don't try to splice + ;; it in. + (cons car-exp result) + (signal (car err) (cdr err))))) + (cons car-exp result)))) + (setq data (cdr data))) + (append (nreverse result) + (w3-expand-parameters pars data)))) + ((symbolp data) + (let ((sym-exp (cdr-safe (assq data pars)))) + (if sym-exp + (w3-expand-parameters pars sym-exp) + data))) + ((vectorp data) + (let ((i 0) + (result (copy-sequence data))) + (while (< i (length data)) + (aset result i + (w3-expand-parameters pars (aref data i))) + (setq i (1+ i))) + result)) + (t + data)))) + +(eval-when-compile + (defun w3-unfold-dtd (items) + (let (result) + (while items + (let* ((item (car items)) + (names (car item)) + (content-model + (or (cdr-safe (assq 'content-model item)) + (error "impossible"))) + (overrides (cdr-safe (assq 'overrides item))) + (end-tag-omissible + (or (cdr-safe (assq 'end-tag-omissible item)) + ;; *** Is this SGML standard? + (eq 'EMPTY content-model))) + (deprecated (cdr-safe (assq 'deprecated item))) + element + name) + (while names + (setq name (car names)) + (setq names (cdr names)) + + ;; Create and initialize the element information data + ;; structure. + (setq element (w3-make-element)) + (w3-set-element-name element name) + (w3-set-element-end-tag-name + element + (intern (concat "/" (symbol-name name)))) + (w3-set-element-state element 0) + (w3-set-element-content-model element content-model) + (w3-set-element-end-tag-omissible element end-tag-omissible) + + (or (memq deprecated '(nil t obsolete)) + (error "impossible")) + (w3-set-element-deprecated element deprecated) + + ;; Inclusions and exclusions are specified differently in the + ;; human-coded DTD than in the format the implementation uses. + ;; The human-coded version is designed to be easy to edit and to + ;; work with w3-expand-parameters while the internal version is + ;; designed to be fast. We have to translate here. This work + ;; is repeated for every element listed in `names' so that the + ;; exclusion exception error messages can be accurate. + (let ((inclusions (cdr-safe (assq 'inclusions item))) + (exclusions (cdr-safe (assq 'exclusions item))) + (exclusion-mode '*close) + (exclusion-message + (format "%s exclusion" (w3-sgml-name-to-string name))) + exceptions) + (while inclusions + (setq exceptions (cons (cons (car inclusions) + '(*include *same nil)) + exceptions)) + (setq inclusions (cdr inclusions))) + (while exclusions + (cond ((memq (car exclusions) '(*discard *include *close)) + (setq exclusion-mode (car exclusions))) + ((stringp (car exclusions)) + (setq exclusion-message (car exclusions))) + (t + (setq exceptions (cons (list (car exclusions) + exclusion-mode + '*same + exclusion-message) + exceptions)))) + (setq exclusions (cdr exclusions))) + (let ((overrides (if exceptions + (cons (cons 'w3-p-d-exceptions + (cons nil exceptions)) + overrides) + overrides))) + (w3-set-element-overrides element overrides))) + + (setq result (cons (cons name element) result)))) + (setq items (cdr items))) + result))) + +;; Load the HTML DTD. +;; <URL:ftp://ds.internic.net/rfc/rfc1866.txt> +;; *** Be sure to incorporate rfc1867 when attribute-checking is added. +;; *** Write function to check sanity of the content-model forms. +;; *** I18N: Add Q, BDO, SPAN +(mapcar + (function + (lambda (pair) + (put (car pair) 'html-element-info (cdr pair)))) + ;; The purpose of this complexity is to speed up loading by + ;; pre-evaluating as much as possible at compile time. + (eval-when-compile + (w3-unfold-dtd + (w3-expand-parameters + '( + (%headempty . (link base meta range)) + (%head-deprecated . (nextid)) + + ;; client-side imagemaps + (%imagemaps . (area map)) + ;; special action is taken for %text inside %body.content in the + ;; content model of each element. + (%body.content . (%heading %block hr div address %imagemaps)) + + (%heading . (h1 h2 h3 h4 h5 h6)) + + ;; Netscape's CENTER, FONT, and BASEFONT are handled + ;; non-standardly. In actual psuedo-HTML on the net, these are + ;; used as both text-level constructs and block-level constructs. + ;; They are the only items in both %block and %text in this + ;; definition here. + ;; *** Perhaps add BLINK here too? But no one uses that as a block + ;; construct. What about NOBR? + (%netscape-crud . (center font basefont)) + + ;; Emacs-w3 extensions + (%emacsw3-crud-nonempty . (roach secret wired)) + (%emacsw3-crud . (pinhead flame cookie yogsothoth hype peek)) + + (%block . (p %list dl form %preformatted %netscape-crud font + %blockquote isindex fn table fig note + %block-deprecated %block-obsoleted)) + (%list . (ul ol)) + (%preformatted . (pre)) + (%blockquote . (bq)) + (%block-deprecated . (dir menu blockquote)) + (%block-obsoleted . (xmp listing)) + + ;; Why is IMG in this list? + (%pre.exclusion . (*include img *discard tab math big small sub sup)) + + (%text . (*data b %notmath %netscape-crud sub sup + %emacsw3-crud %emacsw3-crud-nonempty)) + (%notmath . (%special %font %phrase %misc)) + (%font . (i u s strike tt big small)) ; B left out for MATH handling + (%phrase . (em strong code samp kbd var cite blink)) + ;; Don't know if this is right place for EMBED. + (%special . (a img br wbr nobr tab math embed)) + (%misc . (q lang au dfn person acronym abbrev ins del)) + + (%formula . (*data %math)) + (%math . (box above below %mathvec root sqrt array sub sup + %mathface)) + (%mathvec . (vec bar dot ddot hat tilde)) + (%mathface . (b t bt)) + + (%mathdelims . (over atop choose left right of)) + + ;; What the hell? This takes BODYTEXT????? No way! + (%bq-content-model . [(nil + nil + (((bodytext) *include *next)) + (bodytext *next)) + (nil + nil + (((credit) *include *next)) + nil) + (nil nil nil nil) + ]) + + ;; non-default bad HTML handling. + (%in-text-ignore . ((p %heading) *discard *same error)) + ) + '( + ;; A dummy element that will contain *document. + ((*holder) + (content-model . [(nil nil nil nil)])) + ;; The root of the parse tree. We start with a pseudo-element + ;; named *document for convenience. + ((*document) + (content-model . [(nil nil (((html) *include *next)) (html *next)) + (nil + nil + nil + (*include *same "after document end"))]) + (end-tag-omissible . t)) + ;; HTML O O (HEAD, BODY) + ((html) + (content-model . [(nil + nil + (((head) *include *next)) + (head *next)) + (nil + nil + (((body) *include *next) + ;; Netscape stuff + ((frameset) *include 4) + ) + (body *next)) + (nil + nil + (((plaintext) *include *next)) + (*retry *next)) + (nil + nil + nil + (*include *same "after BODY")) + (nil + nil + nil + (*include *same "after FRAMESET")) + ]) + (end-tag-omissible . t)) + ((head) + (content-model . [((title isindex %headempty style %head-deprecated) + nil + nil + ;; *** Should only close if tag can + ;; legitimately follow head. So many can that + ;; I haven't bothered to enumerate them. + (*close))]) + (end-tag-omissible . t)) + ;; TITLE - - (#PCDATA) + ((title) + (content-model . RCDATA ; not official + ;; [((*data) include-space nil nil)] + )) + ;; STYLE - O (#PCDATA) + ;; STYLE needs to be #PCDATA to allow omitted end tag. Bleagh. + ((style) + (content-model . [((*data) + include-space + nil + ;; *** Should only close if tag can + ;; legitimately follow style. So many can that + ;; I haven't bothered to enumerate them. + (*close))]) + (end-tag-omissible . t)) + ((body) + (content-model . [((banner) nil nil (*retry *next)) + ((bodytext) nil nil (bodytext *next)) + (nil nil (((plaintext) *close)) nil)]) + (inclusions . (spot)) + (end-tag-omissible . t)) + ;; Do I really want to include BODYTEXT? It has something to do + ;; with mixed content screwing things up, and I don't understand + ;; it. Wait! It's used by BQ! + ((bodytext) + (content-model . [((%body.content) + nil + ;; Push <P> before data characters. Non-SGML. + (((%text) p) + ;; Closing when seeing CREDIT is a stupidity + ;; caused by BQ's sharing of BODYTEXT. BQ + ;; should have its own BQTEXT. + ((credit plaintext) *close)) + nil)]) + (end-tag-omissible . t)) + ((div banner) + (content-model . [((%body.content) + nil + ;; Push <P> before data characters. Non-SGML. + (((%text) p)) + nil)])) + ((address) + (content-model . [((p) + nil + ;; Push <P> before data characters. Non-SGML. + (((%text) p)) + nil)])) + ((%heading) + (content-model . [((%text) + include-space + ((%in-text-ignore)) + nil)])) + ((p) + (content-model . [((%text) + include-space + nil + ;; *** Should only close if tag can + ;; legitimately follow P. So many can that I + ;; don't bother to enumerate here. + (*close))]) + (end-tag-omissible . t)) + ((ul ol) + (content-model . [((lh) + nil + (((li) *include *next)) + (*retry *next)) + ((li) + nil + ;; Push <LI> before data characters or block + ;; elements. + ;; Non-SGML. + (((%text %block) li *same error)) + nil)])) + ((lh) + (content-model . [((%text) + include-space + (((dd dt li) *close) + (%in-text-ignore)) + nil)]) + (end-tag-omissible . t)) + ((dir menu) + (content-model . [((li) + nil + (((%text) li *same error)) + nil)]) + (exclusions . (%block)) + (deprecated . t)) + ((li) + (content-model . [((%block) + nil + (((li) *close) + ;; Push <P> before data characters. Non-SGML. + ((%text) p)) + nil)]) + (end-tag-omissible . t) + ;; Better bad HTML handling. + ;; Technically, there are a few valid documents that this will + ;; hose, because you can have H1 inside FORM inside LI. However, + ;; I don't think that should be allowed anyway. + (exclusions . (*discard "not allowed here" %heading))) + ((dl) + (content-model . [((lh) + nil + (((dt dd) *include *next)) + (*retry *next)) + ((dt dd) + nil + ;; Push <DD> before data characters or block + ;; items. + ;; Non-SGML. + (((%text %block) dd *same error)) + nil)])) + ((dt) + (content-model . [((%text) + include-space + (((dd dt) *close) + (%in-text-ignore)) + nil)]) + (end-tag-omissible . t)) + ;; DD is just like LI, but we treat it separately because it can be + ;; followed by a different set of elements. + ((dd) + (content-model . [((%block) + nil + (((dt dd) *close) + ;; Push <P> before data characters. Non-SGML. + ((%text) p)) + nil)]) + (end-tag-omissible . t) + ;; See comment with LI. + (exclusions . (*discard "not allowed here" %heading))) + ((pre) + (content-model . [((%text hr) + include-space + ((%in-text-ignore)) + nil)]) + (exclusions . (%pre.exclusion))) + ;; BLOCKQUOTE deprecated, BQ okay + ((bq) + (content-model . %bq-content-model)) + ((blockquote) + (content-model . %bq-content-model) + ;; BLOCKQUOTE is deprecated in favor of BQ in the HTML 3.0 DTD. + ;; However, BQ is not even mentioned in the HTML 2.0 DTD. So I + ;; don't think we can enable this yet: + ;;(deprecated . t) + ) + ((fn note) + (content-model . [((%body.content) + nil + ;; Push <P> before data characters. Non-SGML. + (((%text) p)) + nil)])) + ((fig) + (content-model . [((overlay) nil nil (*retry *next)) + (nil + nil + (((caption) *include *next)) + (*retry *next)) + (nil + nil + (((figtext) *include *next) + ((credit) *retry *next)) + ;; *** Should only do this for elements that + ;; can be in FIGTEXT. + (figtext *next)) + (nil nil (((credit) *include *next)) nil) + (nil nil nil nil)])) + ((caption credit) + (content-model . [((%text) + nil + ((%in-text-ignore)) + nil)])) + ((figtext) + (content-model . [((%body.content) + nil + ;; Push <P> before data characters. Very non-SGML. + (((%text) p) + ((credit) *close)) + nil)]) + (end-tag-omissible . t)) + ((%emacsw3-crud) + (content-model . EMPTY)) + ((%netscape-crud) + ;; Special non-SGML treatment of Netscape's shit. + (content-model . XINHERIT)) + ;; FORM - - %body.content -(FORM) +(INPUT|KEYGEN|SELECT|TEXTAREA) + ((form) + ;; Same as BODY. Ugh! + (content-model . [((%body.content) + nil + ;; Push <P> before data characters. Non-SGML. + (((%text) p)) + nil)]) + (exclusions . (form)) + (inclusions . (input select textarea keygen label))) + ;; *** Where is the URL describing this? + ((label) + (content-model . [((%text) + include-space + nil + nil)]) + ;; *** These are already included, no need to repeat. + ;;(inclusions . (input select textarea)) + ;; *** Is a LABEL allowed inside a LABEL? I assume no. + (exclusions . (label)) + ;; The next line just does the default so is unneeded: + ;;(end-tag-omissible . nil) + ) + ;; SELECT - - (OPTION+) -(INPUT|KEYGEN|TEXTAREA|SELECT)> + ;; *** This should be -(everything). + ((select) + (content-model . [((option) nil nil nil)]) + (exclusions . (input label select keygen textarea))) + ;; option - O (#PCDATA) + ;; needs to be #PCDATA to allow omitted end tag. + ((option) + ;; I'd like to make this RCDATA to avoid problems with inclusions + ;; like SPOT, but that would conflict with the omitted end-tag, I + ;; think. + (content-model . [((*data) + include-space + (((option) *close)) + nil)]) + (end-tag-omissible . t)) + ;; TEXTAREA - - (#PCDATA) -(INPUT|TEXTAREA|KEYGEN|SELECT) + ((textarea) + ;; Same comment as for OPTION about RCDATA. + (content-model . [((*data) include-space nil nil)]) + (exclusions . (input select label keygen textarea))) + ((hr br img isindex input keygen overlay wbr spot tab + %headempty %mathdelims) + (content-model . EMPTY)) + ((nextid) + (content-model . EMPTY) + (deprecated . t)) + ((a) + (content-model . [((%text) + include-space + (((%heading) + *include *same "deprecated inside A") + ;; *** I haven't made up my mind whether this + ;; is a good idea. It can result in a lot of + ;; bad formatting if the A is *never* closed. + ;;((p) *discard *same error) + ) + nil)]) + (exclusions . (a))) + ((b %font %phrase %misc nobr %emacsw3-crud-nonempty) + (content-model . [((%text) + include-space + ((%in-text-ignore)) + nil)])) + ((plaintext) + (content-model . XXCDATA) + (end-tag-omissible . t) + (deprecated . obsolete)) + ((xmp listing) + (content-model . XCDATA) + (deprecated . obsolete)) + ;; Latest table spec (as of Nov. 13 1995) is at: + ;; <URL:ftp://ds.internic.net/internet-drafts/draft-ietf-html-tables-03.txt> + ((table) + (content-model . [(nil + nil + (((caption) *include *next) + ((col colgroup thead tfoot tbody tr) *retry *next)) + (*retry *next)) ;error handling + ((col colgroup) + nil + (((thead tfoot tbody tr) *retry *next)) + (*retry *next)) ;error handling + (nil + nil + (((thead) *include *next) + ((tfoot tbody tr) *retry *next)) + (*retry *next)) ;error handling + (nil + nil + (((tfoot) *include *next) + ((tbody tr) *retry *next)) + (*retry *next)) ;error handling + ((tbody) + nil + (((tr) tbody *same) + ;; error handling + ((%body.content) tbody *same error)) + nil)])) + ((colgroup) + (content-model . [((col) + nil + (((colgroup thead tfoot tbody tr) *close)) + nil)]) + (end-tag-omissible . t)) + ((col) + (content-model . EMPTY)) + ((thead) + (content-model . [((tr) + nil + (((tfoot tbody) *close) + ;; error handling + ((%body.content) tr *same error)) + nil)]) + (end-tag-omissible . t)) + ((tfoot tbody) + (content-model . [((tr) + nil + (((tbody) *close) + ;; error handling + ((%body.content) tr *same error)) + nil)]) + (end-tag-omissible . t)) + ((tr) + (content-model . [((td th) + nil + (((tr tfoot tbody) *close) + ;; error handling + ((%body.content) td *same error)) + nil)]) + (end-tag-omissible . t)) + ((td th) + ;; Arrgh! Another %body.content!!! Stupid!!! + (content-model . [((%body.content) + nil + (((td th tr tfoot tbody) *close) + ;; Push <P> before data characters. Non-SGML. + ((%text) p)) + nil)]) + (end-tag-omissible . t)) + ((math) + (content-model . [((*data) include-space nil nil)]) + (overrides . + ((w3-p-d-shortref-chars t . "\{_^") + (w3-p-d-shortrefs t . (("\\^" . "<sup>") + ("_" . "<sub>") + ("{" . "<box>"))))) + (inclusions . (%math)) + (exclusions . (%notmath))) + ((sup) + (content-model . [((%text) + include-space + ((%in-text-ignore)) + nil)]) + (overrides . + ((w3-p-d-shortref-chars t . "\{_^") + (w3-p-d-shortrefs t . (("\\^" . "</sup>") + ("_" . "<sub>") + ("{" . "<box>")))))) + ((sub) + (content-model . [((%text) + include-space + ((%in-text-ignore)) + nil)]) + (overrides . + ((w3-p-d-shortref-chars t . "\{_^") + (w3-p-d-shortrefs t . (("\\^" . "<sup>") + ("_" . "</sub>") + ("{" . "<box>")))))) + ((box) + (content-model . [((%formula) + include-space + (((left) *include 1) + ((over atop choose) *include 2) + ((right) *include 3)) + nil) + ((%formula) + include-space + (((over atop choose) *include 2) + ((right) *include 3)) + nil) + ((%formula) + include-space + (((right) *include 3)) + nil) + ((%formula) include-space nil nil)]) + (overrides . + ((w3-p-d-shortref-chars t . "{}_^") + (w3-p-d-shortrefs t . (("\\^" . "<sup>") + ("_" . "<sub>") + ("{" . "<box>") + ("}" . "</box>")))))) + ((above below %mathvec t bt sqrt) + (content-model . [((%formula) include-space nil nil)])) + ;; ROOT has a badly-specified content-model in HTML 3.0. + ((root) + (content-model . [((%formula) + include-space + (((of) *include *next)) + nil) + ((%formula) include-space nil nil)])) + ((of) + (content-model . [((%formula) include-space nil nil)]) + ;; There is no valid way to infer a missing end-tag for OF. This + ;; is bizarre. + (end-tag-omissible . t)) + ((array) + (content-model . [((row) nil nil nil)])) + ((row) + (content-model . [((item) nil (((row) *close)) nil)]) + (end-tag-omissible . t)) + ((item) + (content-model . [((%formula) + include-space + (((row item) *close)) + nil)]) + (end-tag-omissible . t)) + ;; The old parser would look for the </EMBED> end-tag and include + ;; the contents between <EMBED> and </EMBED> as the DATA attribute + ;; of the EMBED start-tag. However, it did not require the + ;; </EMBED> end-tag and did nothing if it was missing. This is + ;; completely impossible to specify in SGML. + ;; + ;; See + ;; <URL:http://www.eit.com/goodies/lists/www.lists/www-html.1995q3/0603.html> + ;; + ;; Questions: Does EMBED require the end-tag? How does NOEMBED fit + ;; into this? Where can EMBED appear? + ;; + ;; Nov. 25 1995: a new spec for EMBED (also an I-D): + ;; <URL:http://www.cs.princeton.edu/~burchard/www/interactive/> + ;; + ;; Here is my guess how to code EMBED: + ((embed) + (content-model . [((noembed) nil nil (*close))])) + ((noembed) + (content-model . [((%body.content) ; hack hack hack + nil + (((%text) p)) + nil)])) + ;; + ;; FRAMESET is a Netscape thing. + ;; <URL:http://www.eit.com/goodies/lists/www.lists/www-html.1995q3/0588.html> + ((frameset) + (content-model . [((noframes frame frameset) nil nil nil)])) + ((noframes) + (content-model . [((%body.content) + nil + ;; Push <P> before data characters. Non-SGML. + (((%text) p)) + nil)])) + ((frame) + (content-model . EMPTY)) + ;; + ;; APPLET is a Java thing. + ;; <URL:http://java.sun.com/JDK-beta/filesinkit/README> + ((applet) + ;; I really don't want to add another ANY content-model. + (content-model . XINHERIT) + (inclusions . (param))) + ((param) + (content-model . EMPTY)) + ;; backward compatibility with old Java. + ((app) + (content-model . EMPTY)) + ;; Client-side image maps. + ;; <URL:ftp://ds.internic.net/internet-drafts/draft-seidman-clientsideimagemap-01.txt> + ;; *** The only problem is that I don't know in what elements MAP + ;; can appear, so none of this is reachable yet. + ((map) + (content-model . [((area) nil nil nil)])) + ((area) + (content-model . EMPTY)) + ))))) + + +;;; +;;; Omitted tag inference using state transition tables. +;;; + +(eval-when-compile + + (w3-p-s-var-def w3-p-s-includep) + (w3-p-s-var-def w3-p-s-state-transitions) + (w3-p-s-var-def w3-p-s-transition) + (w3-p-s-var-def w3-p-s-tran-list) + (w3-p-s-var-def w3-p-s-content-model) + (w3-p-s-var-def w3-p-s-except) + ;; Uses free variables: + ;; w3-p-d-current-element, w3-p-d-exceptions + ;; Destroys free variables: + ;; w3-p-s-includep, w3-p-s-state-transitions, w3-p-s-transition, + ;; w3-p-s-tran-list, w3-p-s-content-model, w3-p-s-except + ;; Returns t if the element or data characters should be included. + ;; Returns nil if the element or data characters should be discarded. + (defsubst w3-grok-tag-or-data (tag-name) + (while + (cond + ((symbolp (setq w3-p-s-content-model + (w3-element-content-model w3-p-d-current-element))) + (or (and (memq w3-p-s-content-model + '(CDATA RCDATA XCDATA XXCDATA)) + (memq tag-name '(*data *space))) + ;; *** Implement ANY. + (error "impossible")) + (setq w3-p-s-includep t) + ;; Exit loop. + nil) + (t + ;; We have a complex content model. + ;; Cache some data from the element info structure. Format is: + ;; (INCLUDES INCSPACEP (((TAG ...) . TRANSITION) ...) DEFAULT) + (setq w3-p-s-state-transitions + (aref w3-p-s-content-model + (w3-element-state w3-p-d-current-element))) + + ;; Optimize the common cases. + (cond + ((eq '*space tag-name) + ;; Optimizing the (*space *discard *same nil) transition. + (setq w3-p-s-includep (car (cdr w3-p-s-state-transitions))) + ;; Don't loop. + nil) + ((and (not (setq w3-p-s-except + (assq tag-name w3-p-d-exceptions))) + (memq tag-name (car w3-p-s-state-transitions))) + ;; Equivalent to a transition of (TAG *include *same nil). + ;; So we are done, return t to caller. + (setq w3-p-s-includep t) + ;; Exit loop. + nil) + (t + ;; The general case. + (cond + ;; Handle inclusions and exclusions. + (w3-p-s-except + (setq w3-p-s-transition (cdr w3-p-s-except))) + ;; See if the transition is in the complex transitions + ;; component. + ((progn + (setq w3-p-s-tran-list + (car (cdr (cdr w3-p-s-state-transitions)))) + (setq w3-p-s-transition nil) + (while w3-p-s-tran-list + (cond ((memq tag-name (car (car w3-p-s-tran-list))) + ;; We've found a transition. + (setq w3-p-s-transition + (cdr (car w3-p-s-tran-list))) + (setq w3-p-s-tran-list nil)) + (t + (setq w3-p-s-tran-list (cdr w3-p-s-tran-list))))) + ;; Check if we found it. + w3-p-s-transition) + ;; body of cond clause empty + ) + ;; Try finding the transition in the DEFAULT component of the + ;; transition table, but avoid doing this for unknown elements, + ;; always use the default-default for them. + ((and (or (eq '*data tag-name) + (w3-known-element-p tag-name)) + (setq w3-p-s-transition + (nth 3 w3-p-s-state-transitions))) + ;; body of cond clause empty + ) + (t + ;; Supply a default-default transition. + (if (not (or (eq '*data tag-name) + (w3-known-element-p tag-name))) + (setq w3-p-s-transition + '(*discard *same "unknown element")) + + ;; Decide whether to *close or *discard + ;; based on whether this element would be + ;; accepted as valid in an open ancestor. + (let ((open-list w3-p-d-open-element-stack) + (all-end-tags-omissible + (w3-element-end-tag-omissible w3-p-d-current-element)) + state-transitions tran-list) + (if (catch 'found + (while open-list + (setq state-transitions + (aref (w3-element-content-model + (car open-list)) + (w3-element-state (car open-list)))) + (if (memq tag-name (car state-transitions)) + (throw 'found t)) + (setq tran-list (nth 2 state-transitions)) + (while tran-list + (cond ((memq tag-name (car (car tran-list))) + (if (not (nth 3 (car tran-list))) + ;; Not an error transition. + (throw 'found t)) + (setq tran-list nil)) + (t + (setq tran-list (cdr tran-list))))) + ;; The input item is not accepted in this + ;; ancestor. Try again in next ancestor. + (or (w3-element-end-tag-omissible (car open-list)) + (setq all-end-tags-omissible nil)) + (setq open-list (cdr open-list))) + nil) + (setq w3-p-s-transition + (if (w3-element-end-tag-omissible + w3-p-d-current-element) + (if all-end-tags-omissible + ;; Probably indicates a need to debug + ;; the DTD state-transition tables. + '(*close *same + "missing transition in DTD?") + ;; Error will be reported later. + '(*close *same)) + '(*close *same "not allowed here"))) + (setq w3-p-s-transition + '(*discard *same "not allowed here"))))))) + + ;; We have found a transition to take. The transition is of + ;; the format (ACTION NEW-STATE ERRORP) where the latter two + ;; items are optional. + + ;; First, handle any state-change. + (or (memq (car-safe (cdr w3-p-s-transition)) '(nil *same)) + (w3-set-element-state + w3-p-d-current-element + (if (eq '*next (car-safe (cdr w3-p-s-transition))) + (1+ (w3-element-state w3-p-d-current-element)) + (car-safe (cdr w3-p-s-transition))))) + + ;; Handle any error message. + (if (car-safe (cdr-safe (cdr w3-p-s-transition))) + (w3-debug-html + :mandatory-if (and (eq '*data tag-name) + (eq '*discard (car w3-p-s-transition))) + (format "Bad %s [%s], %s" + (if (eq '*data tag-name) + "data characters" + (concat "start-tag " + (w3-sgml-name-to-string tag-name))) + (if (stringp (car (cdr (cdr w3-p-s-transition)))) + (car (cdr (cdr w3-p-s-transition))) + "not allowed here") + (let ((action (car w3-p-s-transition))) + (cond ((eq '*discard action) + "discarding bad item") + ((eq '*close action) + (concat "inferring </" + (w3-sgml-name-to-string + (w3-element-name + w3-p-d-current-element)) + ">")) + ((eq '*include action) + "including bad item anyway") + ((eq '*retry action) + "*retry ??? you shouldn't see this") + (t + (concat "inferring <" + (w3-sgml-name-to-string action) + ">"))))))) + + ;; Handle the action. + (cond + ((eq '*include (car w3-p-s-transition)) + (setq w3-p-s-includep t) + ;; Exit loop. + nil) + ((eq '*close (car w3-p-s-transition)) + ;; Perform end-tag inference. + (w3-close-element) ; don't pass parameter + ;; Loop and try again in parent element's content-model. + t) + ((eq '*discard (car w3-p-s-transition)) + (setq w3-p-s-includep nil) + ;; Exit loop. + nil) + ((eq '*retry (car w3-p-s-transition)) + ;; Loop and try again after state change. + t) + ((symbolp (car w3-p-s-transition)) + ;; We need to open another element to contain the text, + ;; probably a <P> (look in the state table). + (w3-open-element (car w3-p-s-transition) nil) + ;; Now we loop and try again in the new element's + ;; content-model. + t) + (t + (error "impossible"))))))) + + ;; Empty while loop body. + ) + + ;; Return value to user indicating whether to include or discard item: + ;; t ==> include + ;; nil ==> discard + w3-p-s-includep) + + ) + + +;;; +;;; Main parser. +;;; + +(defvar w3-last-parse-tree nil + "Used for debugging only. Stores the most recently computed parse tree +\(a tree, not a parse tag stream\).") + +(defun w3-display-parse-tree (&optional ptree) + (interactive) + (with-output-to-temp-buffer "W3 HTML Parse Tree" + (set-buffer standard-output) + (emacs-lisp-mode) + (require 'pp) + (pp (or ptree w3-last-parse-tree)))) + +(defalias 'w3-display-last-parse-tree 'w3-display-parse-tree) + +;; For compatibility with the old parser interface. +(defalias 'w3-preparse-buffer 'w3-parse-buffer) + +;; %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +;; % % +;; % This is the *ONLY* valid entry point in this file! % +;; % DO NOT call any of the other functions! % +;; % % +;; %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +(defun w3-parse-buffer (&optional buff nodraw) + "Parse contents of BUFF as HTML. +BUFF defaults to the value of url-working-buffer. +Destructively alters contents of BUFF. +Unless optional second argument NODRAW is non-nil, calls the display +engine on the parsed HTML. +Returns a data structure containing the parsed information." + + (set-buffer (or buff url-working-buffer)) + (setq buff (current-buffer)) + (set-syntax-table w3-sgml-md-syntax-table) + (buffer-disable-undo (current-buffer)) + (widen) ; sanity checking + (goto-char (point-min)) + (setq case-fold-search t) ; allows smaller regexp patterns + + ;; Some unknown pre-parse buffer munging. + (if (fboundp 'sera-to-fidel-marker) + (let ((sera-being-called-by-w3 t)) + ;; eval stops the compiler from complaining. + (eval '(sera-to-fidel-marker)))) + (goto-char (point-min)) + + ;; *** Should premunge line boundaries. + ;; ******************** + + ;; Prepare another buffer to draw in unless told not to. + (if (not nodraw) + (w3-prepare-draw-buffer-for-parse-buffer)) + + (let* ( + ;; Speed hack, see the variable doc string. + (gc-cons-threshold (if (> w3-gc-cons-threshold-multiplier 0) + (* w3-gc-cons-threshold-multiplier + gc-cons-threshold) + gc-cons-threshold)) + + ;; Used to determine if we made any progress since the last loop. + (last-loop-start (point-min)) + + ;; How many iterations of the main loop have occurred. Used only + ;; to send messages to the user periodically, since this function + ;; can take some time. + (loop-count 0) + + ;; Precomputing the loop-invariant parts of this for speed. + (status-message-format + (if url-show-status + (format "Parsed %%3d%%%% of %d..." (- (point-max) (point-min))))) + + ;; Use a float value for 100 if possible, otherwise integer. + ;; Determine which we can use outside of the loop for speed. + (one-hundred (funcall (if (fboundp 'float) 'float 'identity) 100)) + + ;; Speed up checking whether to do incremental display. + (w3-do-incremental-display (if nodraw nil w3-do-incremental-display)) + + ;; Used to convert parse tree to tag stream that old display + ;; engine expects. Will change when display engine is rewritten. + (parse-tag-stream '(*dummy)) + + ;; See doc string. + (w3-p-d-parse-tag-stream-tail-pointer parse-tag-stream) + + ;; Points to cons cell in parse-tag-stream whose car is the last + ;; item that has been sent to display engine. + (parse-tag-stream-last-displayed-item parse-tag-stream) + + ;; The buffer which contains the HTML we are parsing. This + ;; variable is used to avoid using the more expensive + ;; save-excursion. + (parse-buffer (current-buffer)) + + ;; Points to start of region of text since the previous tag. + (between-tags-start (point-min)) + + ;; Points past end of region of text since the previous tag. Only + ;; non-nil when the region has been completely determined and is + ;; ready to be processed. + between-tags-end + + ;; See doc string. + w3-p-d-tag-name + + ;; See doc string. + w3-p-d-end-tag-p + + ;; Is the tag we are looking at a null-end-tag-enabling + ;; start-tag? + net-tag-p + + ;; Attributes of the tag we are looking at. An alist whose items + ;; are pairs of the form (SYMBOL . STRING). + tag-attributes + + ;; Points past end of attribute value we are looking at. Points + ;; past the syntactic construct, not the value of the attribute, + ;; which may be at (1- attribute-value-end). + attribute-value-end + + ;; Points past end of tag we are looking at. + tag-end + + ;; See doc string. + (w3-p-d-current-element (w3-fresh-element-for-tag '*document)) + + ;; See doc string. + (w3-p-d-open-element-stack (list (w3-fresh-element-for-tag '*holder))) + + ;; ***not implemented yet*** + (marked-section-undo-stack nil) + + ;; See doc string. + (w3-p-d-debug-url t) + + ;; Any of the following variables with the comment ";*NESTED*" + ;; are syntactic or semantic features that were introduced by + ;; some containing element or marked section which will be undone + ;; when we close that element or marked section. + + ;; See doc string. + (w3-p-d-non-markup-chars nil) ;*NESTED* + + ;; See doc string. + (w3-p-d-null-end-tag-enabled nil) ;*NESTED* + + ;; See doc string. + (w3-p-d-in-parsed-marked-section nil) ;*NESTED* + + ;; See doc string. + (w3-p-d-shortrefs nil) ;*NESTED* + + ;; See doc string. + (w3-p-d-shortref-chars nil) ;*NESTED* + + ;; ******* maybe not needed. + ;; + ;; ;; Are we recognizing start-tags? + ;; (recognizing-start-tags t) ;*NESTED* + ;; + ;; ;; Are we recognizing end-tags? If this is non-nil and not t, + ;; ;; then only the end tag of the current open element is + ;; ;; recognized. + ;; (recognizing-end-tags t) ;*NESTED* + + ;; See doc string. + (w3-p-d-exceptions nil) ;*NESTED* + + ;; Scratch variables used in this function + ref attr-name attr-value content-model content open-list + ) + ;; Scratch variables used by macros and defsubsts we call. + (w3-p-s-let-bindings + + (w3-update-non-markup-chars) + + ;; Main loop. Handle markup as follows: + ;; + ;; non-empty tag: Handle the region since the previous tag as PCDATA, + ;; RCDATA, CDATA, if allowed by syntax. Then handle the tag. + ;; + ;; general entity (&name;): expand it and parse the result. + ;; + ;; shortref (_, {, }, and ^ in math stuff): Expand it and parse the + ;; result. + ;; + ;; SGML marked section (<![ keywords [ conditional-text ]]>): Either + ;; strip the delimiters and parse the result or delete. + ;; + ;; comment: Delete. + ;; + ;; empty tag (<>, </>): Handle as the appropriate tag. + ;; + ;; markup declaration (e.g. <!DOCTYPE ...>): Delete. + ;; + ;; SGML processing instruction (<?name>): Delete. + ;; + (while + ;; Continue as long as we processed something last time and we + ;; have more to process. + (prog1 + (not (and (= last-loop-start (point)) + (eobp))) + (setq last-loop-start (point))) + + ;; Display progress messages if asked and/or do incremental display + ;; of results + (cond ((= 0 (% (setq loop-count (1+ loop-count)) 40)) + (if w3-do-incremental-display + (w3-pause)) + (if status-message-format + (message status-message-format + ;; Percentage of buffer processed. + (/ (* (point) one-hundred) (point-max)))))) + + ;; Go to next interesting thing in the buffer. + (skip-chars-forward w3-p-d-non-markup-chars) + + ;; We are looking at a markup-starting character, and invalid + ;; character, or end of buffer. + (cond + + ((= ?< (following-char)) + + ;; We are looking at a tag, comment, markup declaration, SGML marked + ;; section, SGML processing instruction, or non-markup "<". + (forward-char) + (cond + + ((looking-at "/?\\([a-z][-a-z0-9.]*\\)") + ;; We are looking at a non-empty tag. + + (setq w3-p-d-tag-name + (intern (downcase (buffer-substring (match-beginning 1) + (match-end 1))))) + (setq w3-p-d-end-tag-p (= ?/ (following-char))) + (setq between-tags-end (1- (point))) + (goto-char (match-end 0)) + + ;; Read the attributes from a start-tag. + (or + w3-p-d-end-tag-p + + ;; Attribute values can be: + ;; "STRING" where STRING does not contain the double quote + ;; 'STRING' where STRING does not contain the single quote + ;; name-start character, *name character + ;; *name character + ;; Digit, +name character + ;; +Digit + ;; or a SPACE-separated list of one of the last four + ;; possibilities (there is a comment somewhere that this is a + ;; misinterpretation of the grammar, so we ignore this + ;; possibility). + (while + (looking-at + (eval-when-compile + (concat + ;; Leading whitespace. + "[ \n\r\t]*" + ;; The attribute name, possibly with a bad syntax + ;; component. + "\\([a-z][-a-z0-9.]*\\(\\([_][-a-z0-9._]*\\)?\\)\\)" + ;; Trailing whitespace and perhaps an "=". + "[ \n\r\t]*\\(\\(=[ \n\r\t]*\\)?\\)"))) + + (cond ((/= (match-beginning 2) (match-end 2)) + (w3-debug-html + :nocontext + (format "Bad attribute name syntax: %s" + (buffer-substring (match-beginning 1) + (match-end 1)))))) + + (setq attr-name + (intern (downcase (buffer-substring (match-beginning 1) + (match-end 1))))) + (goto-char (match-end 0)) + (cond + ((< (match-beginning 4) (match-end 4)) + ;; A value was specified (e.g. ATTRIBUTE=VALUE). + (cond + ((looking-at + (eval-when-compile + (concat + ;; Literal with double quotes. + "\"\\([^\"]*\\)\"" + "\\|" + ;; Literal with single quotes. + "'\\([^']\\)*'" + "\\|" + ;; Handle bad HTML conflicting with NET-enabling + ;; start-tags. + "\\([-a-z0-9.]+/[-a-z0-9._/#]+\\)[ \t\n\r>]" + "\\|" + ;; SGML NAME-syntax attribute value. + "\\([-a-z0-9.]+\\)[ \t\n\r></]" + ))) + (cond + ((or (match-beginning 1) + (match-beginning 2)) + ;; We have an attribute value literal. + (narrow-to-region (1+ (match-beginning 0)) + (1- (match-end 0))) + + ;; In attribute value literals, EE and RS are ignored + ;; and RE and SEPCHAR characters sequences are + ;; replaced by SPACEs. + ;; + ;; (There is no way right now to get RS into one of + ;; these so that it can be ignored. This is due to + ;; our using Unix line-handling conventions.) + (skip-chars-forward "^&\t\n\r") + (if (eobp) + nil + ;; We must expand entities and replace RS, RE, + ;; and SEPCHAR. + (goto-char (point-min)) + (while (progn + (skip-chars-forward "^&") + (not (eobp))) + (w3-expand-entity-at-point-maybe)) + (subst-char-in-region (point-min) (point-max) ?\t 32) + (subst-char-in-region (point-min) (point-max) ?\n 32)) + ;; Set this after we have changed the size of the + ;; attribute. + (setq attribute-value-end (1+ (point-max)))) + ((match-beginning 4) + (setq attribute-value-end (match-end 4)) + (narrow-to-region (point) attribute-value-end)) + ((match-beginning 3) + (setq attribute-value-end (match-end 3)) + (narrow-to-region (point) attribute-value-end) + ;; Horribly illegal non-SGML handling of bad + ;; HTML on the net. This can break valid HTML. + (setq attr-value (buffer-substring (point) + (match-end 3))) + (w3-debug-html :nocontext + (format "Evil attribute value syntax: %s" + (buffer-substring (point-min) (point-max))))) + (t + (error "impossible")))) + ((memq (following-char) '(?\" ?')) + ;; Missing terminating quote character. + (narrow-to-region (point) + (progn + (forward-char 1) + (skip-chars-forward "^ \t\n\r'\"=<>") + (setq attribute-value-end (point)))) + (w3-debug-html :nocontext + (format "Attribute value missing end quote: %s" + (buffer-substring (point-min) (point-max)))) + (narrow-to-region (1+ (point-min)) (point-max))) + (t + ;; We have a syntactically invalid attribute value. Let's + ;; make a best guess as to what the author intended. + (narrow-to-region (point) + (progn + (skip-chars-forward "^ \t\n\r'\"=<>") + (setq attribute-value-end (point)))) + (w3-debug-html :nocontext + (format "Bad attribute value syntax: %s" + (buffer-substring (point-min) (point-max)))))) + ;; Now we have isolated the attribute value. We need to + ;; munge the value depending on the syntax of the + ;; attribute. + ;; *** Right now, we only implement the necessary munging + ;; for CDATA attributes, which is none. I'm not sure why + ;; this happens to work for other attributes right now. + ;; For any other kind of attribute, we are supposed to + ;; * smash case + ;; * remove leading/trailing whitespace + ;; * smash multiple space sequences into single spaces + ;; * verify the syntax of each token + (setq attr-value (buffer-substring (point-min) (point-max))) + (widen) + (goto-char attribute-value-end)) + (t + ;; No value was specified, in which case NAME should be + ;; taken as ATTRIBUTE=NAME where NAME is one of the + ;; enumerated values for ATTRIBUTE. + ;; We assume here that ATTRIBUTE is the same as NAME. + ;; *** Another piece of code will fix the attribute name if it + ;; is wrong. + (setq attr-value (symbol-name attr-name)))) + + ;; Accumulate the attributes. + (setq tag-attributes (cons (cons attr-name attr-value) + tag-attributes)))) + + ;; Process the end of the tag. + (skip-chars-forward " \t\n\r") + (cond ((= ?> (following-char)) + ;; Ordinary tag end. + (forward-char 1)) + ((and (= ?/ (following-char)) + (not w3-p-d-end-tag-p)) + ;; This is a NET-enabling start-tag. + (setq net-tag-p t) + (forward-char 1)) + ((= ?< (following-char)) + ;; *** Strictly speaking, the following text has to + ;; lexically be STAGO or ETAGO, which means that it + ;; can't match some other lexical unit. + ;; Unclosed tag. + nil) + (t + ;; Syntax error. + (w3-debug-html + (format "Bad unclosed %s%s tag" + (if w3-p-d-end-tag-p "/" "") + (w3-sgml-name-to-string w3-p-d-tag-name))))) + + (setq tag-end (point))) + + ((looking-at "/?>") + ;; We are looking at an empty tag (<>, </>). + (setq w3-p-d-end-tag-p (= ?/ (following-char))) + (setq w3-p-d-tag-name (if w3-p-d-end-tag-p + (w3-element-name w3-p-d-current-element) + ;; *** Strictly speaking, if OMITTAG NO, then + ;; we should use the most recently closed tag. + ;; But OMITTAG YES in HTML and I'm lazy. + (w3-element-name w3-p-d-current-element))) + (setq tag-attributes nil) + ;; *** Make sure this is not at top level. + (setq between-tags-end (1- (point))) + (setq tag-end (match-end 0))) + + ;; *** In SGML, <(doctype)element> is valid tag syntax. This + ;; cannot occur in HTML because the CONCUR option is off in the + ;; SGML declaration. + + ((looking-at "!--") + ;; We found a comment, delete to end of comment. + (delete-region + (1- (point)) + (progn + (forward-char 1) + ;; Skip over pairs of -- ... --. + (if (looking-at "\\(--[^-]*\\(-[^-]+\\)*--[ \t\r\n]*\\)+>") + (goto-char (match-end 0)) + ;; Syntax error! + (w3-debug-html + "Bad comment (unterminated or unbalanced \"--\" pairs)") + (forward-char 2) + (or (re-search-forward "--[ \t\r\n]*>" nil t) + (search-forward ">" nil t))) + (point)))) + + ((looking-at "!>\\|\\?[^>]*>") + ;; We are looking at an empty comment or a processing + ;; instruction. Delete it. + (replace-match "") + (delete-char -1)) + + ((looking-at "![a-z]") + ;; We are looking at a markup declaration. Delete it. + ;; *** Technically speaking, to handle valid HTML I think we + ;; need to handle "<!USEMAP ... >" declarations. In the future, + ;; to handle general SGML, we should parse "<!DOCTYPE ... >" + ;; declarations as well (which can contain other declarations). + ;; In the very distant future, perhaps we will handle "<!SGML + ;; ... >" declarations. + ;; *** Should warn if it's not SGML, DOCTYPE, or USEMAP. + (backward-char 1) + (delete-region + (point) + (progn + (condition-case nil + (forward-sexp 1) + (error + ;; *** This might not actually be bad syntax, but might + ;; instead be a -- ... -- comment with unbalanced + ;; parentheses somewhere inside the declaration. Handling + ;; this properly would require full parsing of markup + ;; declarations, a goal for the future. + (w3-debug-html "Bad <! syntax.") + (skip-chars-forward "^>") + (if (= ?> (following-char)) + (forward-char)))) + (point)))) + + ((looking-at "!\\\[\\(\\([ \t\n\r]*[a-z]+\\)+[ \t\n\r]*\\)\\\[") + ;; We are looking at a marked section. + ;; *** Strictly speaking, we should issue a warning if the + ;; keywords are invalid or missing or if the "[" does not follow. + ;; We must look at the keywords to understand how to parse it. + ;; *** Strictly speaking, we should perform parameter entity + ;; substitution on the keywords first. + (goto-char (match-beginning 1)) + (insert ?\)) + (goto-char (1- (match-beginning 0))) + (delete-char 3) + (insert ?\() + (backward-char 1) + (let* ((keywords (read (current-buffer))) + ;; Multiple keywords may appear, but only the most + ;; significant takes effect. Rank order is IGNORE, CDATA, + ;; RCDATA, INCLUDE, and TEMP. INCLUDE and TEMP have the + ;; same effect. + (keyword (car-safe (cond ((memq 'IGNORE keywords)) + ((memq 'CDATA keywords)) + ((memq 'RCDATA keywords)) + ((memq 'INCLUDE keywords)) + ((memq 'TEMP keywords)))))) + (or (= ?\[ (following-char)) + ;; I probably shouldn't even check this, since it is so + ;; impossible. + (error "impossible")) + (forward-char 1) + (delete-region (1- (match-beginning 0)) (point)) + (cond ((eq 'IGNORE keyword) + ;; Scan forward skipping over matching <![ ... ]]> + ;; until we find an unmatched "]]>". + (let ((ignore-nesting 1) + (start-pos (point))) + (while (> ignore-nesting 0) + (if (re-search-forward "<!\\\\\[\\|\]\]>" nil t) + (setq ignore-nesting + (if (eq ?> (preceding-char)) + (1- ignore-nesting) + (1+ ignore-nesting))) + (w3-debug-html + "Unterminated IGNORE marked section.") + (setq ignore-nesting 0) + (goto-char start-pos))) + (delete-region start-pos (point)))) + ((eq 'CDATA keyword) + (error "***unimplemented***")) + ((eq 'RCDATA keyword) + (error "***unimplemented***")) + ((memq keyword '(INCLUDE TEMP)) + (error "***unimplemented***"))))) + ((and (looking-at "!") + w3-netscape-compatible-comments) + ;; Horribly illegal non-SGML handling of bad HTML on the net. + ;; This can break valid HTML. + ;; This arises because Netscape discards anything looking like + ;; "<!...>". So people expect they can use this construct as + ;; a comment. + (w3-debug-html "Evil <! comment syntax.") + (backward-char 1) + (delete-region + (point) + (progn + (skip-chars-forward "^>") + (if (= ?> (following-char)) + (forward-char)) + (point)))) + (t + ;; This < is not a markup character. Pretend we didn't notice + ;; it at all. We have skipped over the < already, so just loop + ;; again. + ))) + + ((= ?& (following-char)) + (w3-expand-entity-at-point-maybe)) + + ((and (= ?\] (following-char)) + w3-p-d-in-parsed-marked-section + (looking-at "]]>")) + ;; *** handle the end of a parsed marked section. + (error "***unimplemented***")) + + ((and (= ?/ (following-char)) + w3-p-d-null-end-tag-enabled) + ;; We are looking at a null end tag. + (setq w3-p-d-end-tag-p t) + (setq between-tags-end (point)) + (setq tag-end (1+ (point))) + (setq w3-p-d-tag-name (w3-element-name w3-p-d-current-element))) + + ;; This can be slow, since we'll hardly ever get here. + ;; *** Strictly speaking, I think we're supposed to handle + ;; shortrefs that begin with the same characters as other markup, + ;; preferring the longest match. + ;; I will assume that shortrefs never begin with <, &, \], /. + ((setq ref (catch 'found-shortref + (let ((refs w3-p-d-shortrefs)) + (while refs + (if (looking-at (car (car refs))) + (throw 'found-shortref (cdr (car refs)))) + (setq refs (cdr refs)))))) + ;; We are looking at a shortref for which there is an + ;; expansion defined in the current syntax. Replace with the + ;; expansion, leaving point at the beginning so it will be parsed + ;; on the next loop. + ;; *** eek. This is wrong if the shortref is for an entity with + ;; CDATA syntax which should not be reparsed for tags. + (replace-match "") + (let ((pt (point))) + (insert ref) + (goto-char pt))) + + ((looking-at (eval-when-compile + (concat "[" (w3-invalid-sgml-chars) "]"))) + (w3-debug-html + (format "Invalid SGML character: %c" (following-char))) + (insert (or (cdr-safe (assq (following-char) + ;; These characters are apparently + ;; from a Windows character set. + '((146 . "'") + (153 . "TM")))) + "")) + (delete-char 1)) + + ((eobp) + ;; We have finished the buffer. Make sure we process the last + ;; piece of text, if any. + (setq between-tags-end (point)) + ;; We have to test what's on the element stack because this + ;; piece of code gets executed twice. + (cond ((not (eq '*holder (w3-element-name w3-p-d-current-element))) + ;; This forces the calculation of implied omitted end tags. + (setq w3-p-d-tag-name '*document) + (setq w3-p-d-end-tag-p t) + (setq tag-end (point))))) + + (t + (error "unreachable code, this can't happen"))) + + ;; If we have determined the boundaries of a non-empty between-tags + ;; region of text, then handle it. + (cond + (between-tags-end + (cond + ((< between-tags-start between-tags-end) + ;; We have a non-empty between-tags region. + + ;; We check if it's entirely whitespace, because we record the + ;; transitions for whitespace separately from those for + ;; data with non-whitespace characters. + (goto-char between-tags-start) + (skip-chars-forward " \t\n\r" between-tags-end) + (cond + ((w3-grok-tag-or-data (prog1 + (if (= between-tags-end (point)) + '*space + '*data) + (goto-char between-tags-end))) + ;; We have to include the text in the current element's + ;; contents. If this is the first item in the current + ;; element's contents, don't include a leading newline if + ;; there is one. Add a trailing newline as a separate text + ;; item so that it can be removed later if it turns out to + ;; be the last item in the current element's contents when + ;; the current element is closed. + ;; *** We could perform this test before calling + ;; w3-grok-tag-or-data, but it's not clear which will be + ;; faster in practice. + (or (setq content (w3-element-content w3-p-d-current-element)) + ;; *** Strictly speaking, in SGML the record end is + ;; carriage return, not line feed. + (if (= ?\n (char-after between-tags-start)) + (setq between-tags-start (1+ between-tags-start)))) + (if (= between-tags-start (point)) + ;; Do nothing. + nil + ;; We are definitely going to add data characters to the + ;; content. + ;; Protocol is that all but last data character item + ;; must have been sent to display engine. + (and content + (stringp (car content)) + ;; Gross, disgusting hack to deal with old interface + ;; to display engine. Remove as soon as possible. + (not (memq (w3-element-name w3-p-d-current-element) + '(plaintext style xmp textarea))) + (w3-add-display-item 'text (car content))) + (cond + ((and (= ?\n (preceding-char)) + (/= between-tags-start (1- (point)))) + (setq content (cons (buffer-substring between-tags-start + (1- (point))) + content)) + ;; Gross, disgusting hack to deal with old interface + ;; to display engine. Remove as soon as possible. + (or (memq (w3-element-name w3-p-d-current-element) + '(plaintext style xmp textarea)) + (w3-add-display-item 'text (car content))) + (setq content (cons "\n" content))) + (t + (setq content (cons (buffer-substring between-tags-start + (point)) + content)))) + (w3-set-element-content w3-p-d-current-element content)))))) + + (setq between-tags-end nil))) + + ;; If the previous expression modified (point), then it went to + ;; the value of between-tags-end. + + ;; If we found a start or end-tag, we need to handle it. + (cond + (w3-p-d-tag-name + + ;; Move past the tag and prepare for next between-tags region. + (goto-char tag-end) + (setq between-tags-start (point)) + + (cond + (w3-p-d-end-tag-p + ;; Handle an end-tag. + (if (eq w3-p-d-tag-name (w3-element-name w3-p-d-current-element)) + (w3-close-element) + ;; Handle the complex version. We have to search up (down?) + ;; the open element stack to find the element that matches (if + ;; any). Then we close all of the elements. On a conforming + ;; SGML document this can do no wrong and it's not + ;; unreasonable on a non-conforming document. + + ;; Can't safely modify stack until we know the element we want + ;; to find is in there, so work with a copy. + (setq open-list w3-p-d-open-element-stack) + (while (and open-list + (not (eq w3-p-d-tag-name + (w3-element-name (car open-list))))) + (setq open-list (cdr open-list))) + (cond (open-list + ;; We found a match. Pop elements. + ;; We will use the following value as a sentinel. + (setq open-list (cdr open-list)) + (while (not (eq open-list w3-p-d-open-element-stack)) + (w3-close-element t)) + (w3-close-element)) + (t + ;; Bogus end tag. + (w3-debug-html + (format "Unmatched end-tag </%s>" + (w3-sgml-name-to-string w3-p-d-tag-name))))))) + (t + ;; Handle a start-tag. + (cond + ;; Check if the new element is allowed in the current element's + ;; content model. + ((w3-grok-tag-or-data w3-p-d-tag-name) + (w3-open-element w3-p-d-tag-name tag-attributes) + + ;; Handle NET-enabling start tags. + (cond ((and net-tag-p + (not w3-p-d-null-end-tag-enabled)) + ;; Save old values. + (w3-set-element-undo-list + w3-p-d-current-element + (cons (cons 'w3-p-d-non-markup-chars + w3-p-d-non-markup-chars) + (cons '(w3-p-d-null-end-tag-enabled . nil) + (w3-element-undo-list w3-p-d-current-element)))) + ;; Alter syntax. + (setq w3-p-d-null-end-tag-enabled t) + (w3-update-non-markup-chars))) + + (setq content-model + (w3-element-content-model w3-p-d-current-element)) + + ;; If the element does not have parsed contents, then we + ;; can find its contents immediately. + (cond + ((memq content-model '(EMPTY CDATA XCDATA XXCDATA RCDATA)) + (cond + ((eq 'EMPTY content-model) + (w3-close-element)) + ((eq 'CDATA content-model) + ;; CDATA: all data characters until an end-tag. We'll + ;; process the end-tag on the next loop. + (if (re-search-forward (if w3-p-d-null-end-tag-enabled + "</[a-z>]\\|/" + "</[a-z>]") + nil 'move) + (goto-char (match-beginning 0)))) + ((eq 'XCDATA content-model) + ;; XCDATA: special non-SGML-standard mode which includes + ;; all data characters until "</foo" is seen where "foo" + ;; is the name of this element (for XMP and LISTING). + (if (search-forward + (concat "</" (symbol-name + (w3-element-name w3-p-d-current-element))) + nil 'move) + (goto-char (match-beginning 0)))) + ((eq 'XXCDATA content-model) + ;; XXCDATA: special non-SGML-standard mode which includes + ;; all data until end-of-entity (end-of-buffer for us) + ;; (for PLAINTEXT). + (goto-char (point-max))) + ((eq 'RCDATA content-model) + ;; RCDATA: all data characters until end-tag is seen, + ;; except that entities are expanded first, although the + ;; expansions are _not_ scanned for end-tags, although the + ;; expansions _are_ scanned for further entity + ;; references. + (while (progn + (if (re-search-forward (if w3-p-d-null-end-tag-enabled + "</[a-z>]\\|[/&]" + "</[a-z>]\\|&") + nil 'move) + (goto-char (match-beginning 0))) + (= ?& (following-char))) + (w3-expand-entity-at-point-maybe))))))) + (t + ;; The element is illegal here. We'll just discard the start + ;; tag as though we never saw it. + )))) + + (setq w3-p-d-tag-name nil) + (setq w3-p-d-end-tag-p nil) + (setq net-tag-p nil) + (setq tag-attributes nil) + (setq tag-end nil))) + + ;; Hand items to the display engine. + (cond ((not nodraw) + (set-buffer w3-draw-buffer) + (while (not (eq parse-tag-stream-last-displayed-item + w3-p-d-parse-tag-stream-tail-pointer)) + (setq parse-tag-stream-last-displayed-item + (cdr parse-tag-stream-last-displayed-item)) + ;; We call w3-handle-single-tag from only one spot so that it + ;; is reasonable to inline it, since it is a big function. + (w3-handle-single-tag + (car (car parse-tag-stream-last-displayed-item)) + (cdr (car parse-tag-stream-last-displayed-item)))) + (set-buffer parse-buffer))) + + ;; End of main while loop. + ) + + ;; We have finished parsing the buffer! + (if status-message-format + (message "%sdone" (format status-message-format 100))) + ;; Do this now so the user can see the full results before Emacs + ;; goes off and garbage-collects for an hour. :-( + (if w3-do-incremental-display + (w3-pause)) + + ;; *** For debugging, save the true parse tree. + ;; *** Make this look inside *DOCUMENT. + (setq w3-last-parse-tree + (w3-element-content w3-p-d-current-element)) + + ;; Return the parse in the format expected, a stream of tags + ;; possibly with a buffer at the front. + (if nodraw + ;; Discard the *dummy item at start of list. + (cdr parse-tag-stream) + (cons w3-draw-buffer (cdr parse-tag-stream))) + + ))) + + +;;; +;;; Initialization of display engine to accept parser output. +;;; + +(defun w3-prepare-draw-buffer-for-parse-buffer () + (setq list-buffers-directory nil) + (let ((buf (get-buffer-create (url-generate-new-buffer-name + "Untitled"))) + (info (mapcar (function (lambda (x) (cons x (symbol-value x)))) + w3-persistent-variables))) + (setq w3-draw-buffer buf) + (save-excursion + (set-window-buffer (selected-window) buf) + (set-buffer buf) + (setq w3-draw-buffer (current-buffer)) + (erase-buffer) + (buffer-disable-undo (current-buffer)) + (mapcar (function (lambda (x) (set (car x) (cdr x)))) info) + (setq w3-last-fill-pos (point)) + (setq fill-column (min (- (or w3-strict-width (window-width)) + w3-right-border) + (or w3-maximum-line-length (window-width)))) + (setq fill-prefix "") + (w3-init-state)))) + + + +(provide 'w3-parse) + +;; Local variables: +;; indent-tabs-mode: nil +;; end: