Mercurial > hg > xemacs-beta
view lisp/w3/w3-parse.el @ 80:1ce6082ce73f r20-0b90
Import from CVS: tag r20-0b90
author | cvs |
---|---|
date | Mon, 13 Aug 2007 09:06:37 +0200 |
parents | 131b0175ea99 |
children | 6a378aca36af |
line wrap: on
line source
;; Created by: Joe Wells, jbw@cs.bu.edu ;; Created on: Sat Sep 30 17:25:40 1995 ;; Filename: w3-parse.el ;; Purpose: Parse HTML and/or SGML for Emacs W3 browser. ;; Copyright © 1995, 1996 Joseph Brian Wells ;; Copyright © 1993, 1994, 1995 by William M. Perry (wmperry@cs.indiana.edu) ;; ;; This program is free software; you can redistribute it and/or modify ;; it under the terms of the GNU General Public License as published by ;; the Free Software Foundation; either version 2 of the License, or ;; (at your option) any later version. ;; ;; This program is distributed in the hope that it will be useful, ;; but WITHOUT ANY WARRANTY; without even the implied warranty of ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ;; GNU General Public License for more details. ;; ;; You should have received a copy of the GNU General Public License ;; along with GNU Emacs; see the file COPYING. If not, write to the ;; Free Software Foundation, Inc., 59 Temple Place - Suite 330, ;; Boston, MA 02111-1307, USA. ;; ;; On November 13, 1995, the license was available at ;; <URL:ftp://prep.ai.mit.edu/pub/gnu/COPYING-2.0>. It may still be ;; obtainable via that URL. ;;; ;;; Trying to make the best of an evil speed hack. ;;; ;; Explanation: ;; Basically, this file provides one big function (w3-parse-buffer) and ;; some data structures. However, to avoid code redundancy, I have broken ;; out some common subexpressions of w3-parse-buffer into separate ;; functions. I have declared these separate functions with "defsubst" so ;; they will be inlined into w3-parse-buffer. Also, I have defined them ;; within eval-when-compile forms, so no definitions will be emitted into ;; the .elc file for these separate functions. (They will work normally ;; when the uncompiled file is loaded.) ;; Each of these subfunctions use some scratch variables in a purely local ;; fashion. In good software design, I would declare these variables as ;; close to their use as possible with "let". However, "let"-binding ;; variables is *SLOW* in Emacs Lisp, even when compiled. Since each of ;; these functions is executed one or more time during each iteration of ;; the main loop, I deemed this too expensive. So the main function does ;; the "let"-binding of these variables. However, I still want to declare ;; them close to their use, partially to keep the compiler from crying ;; "Wolf!" when there is no danger (well, maybe a little danger :-), so I ;; define some macros for this purpose. ;; Also, there are some variables which are updated throughout the file ;; (remember this is really all one function). Some of the code which ;; updates them is located inside the subfunctions. So that the compiler ;; will not complain, these variables are defined with defvar. (require 'w3-vars) (require 'mule-sysdp) (eval-when-compile (defconst w3-p-s-var-list nil "A list of the scratch variables used by functions called by w3-parse-buffer which it is w3-parse-buffer's responsibility to \"let\"-bind.") ;; *** This is unused and does not belong right here anyway. (defmacro w3-resolve-numeric-entity (ent) "Return a string representing the numeric entity ENT (&#ENT;)" (` (if (< (, ent) 256) (char-to-string (, ent)) (format "[Too large character: %s]" (, ent))))) (defmacro w3-p-s-var-def (var) "Declare VAR as a scratch variable which w3-parse-buffer must \"let\"-bind." (` (eval-when-compile (defvar (, var)) (or (memq '(, var) w3-p-s-var-list) (setq w3-p-s-var-list (cons '(, var) w3-p-s-var-list)))))) (defmacro w3-p-s-let-bindings (&rest body) "\"let\"-bind all of the variables in w3-p-s-var-list in BODY." (` (let (, w3-p-s-var-list) (,@ body)))) (put 'w3-p-s-let-bindings 'lisp-indent-function 0) (put 'w3-p-s-let-bindings 'edebug-form-spec t) (defvar w3-p-d-current-element) (put 'w3-p-d-current-element 'variable-documentation "Information structure for the current open element.") (defvar w3-p-d-exceptions) (put 'w3-p-d-exceptions 'variable-documentation "Alist specifying elements (dis)allowed because of an (ex|in)clusion exception of some containing element (not necessarily the immediately containing element). Each item specifies a transition for an element which overrides that specified by the current element's content model. Each item is of the form (TAG ACTION *same ERRORP).") (defvar w3-p-d-in-parsed-marked-section) (put 'w3-p-d-in-parsed-marked-section 'variable-documentation "Are we in a parsed marked section so that we have to scan for \"]]>\"?") (defvar w3-p-d-non-markup-chars) (put 'w3-p-d-non-markup-chars 'variable-documentation "The characters that do not indicate the start of markup, in the format for an argument to skip-chars-forward.") (defvar w3-p-d-null-end-tag-enabled) (put 'w3-p-d-null-end-tag-enabled 'variable-documentation "Is the null end tag (\"/\") enabled?") (defvar w3-p-d-open-element-stack) (put 'w3-p-d-open-element-stack 'variable-documentation "A stack of the currently open elements, with the innermost enclosing element on top and the outermost on bottom.") (defvar w3-p-d-shortrefs) (put 'w3-p-d-shortrefs 'variable-documentation "An alist of the magic entity reference strings in the current between-tags region and their replacements. Each item is of the format \(REGEXP . REPLACEMENT-STRING\). Although in SGML shortrefs normally name entities whose value should be used as the replacement, we have preexpanded the entities for speed. We have also regexp-quoted the strings to be replaced, so they can be used with looking-at. This should never be in an element's overrides field unless w3-p-d-shortref-chars is also in the field.") (defvar w3-p-d-shortref-chars) (put 'w3-p-d-shortref-chars 'variable-documentation "A string of the characters which can start shortrefs in the current between-tags region. This must be in a form which can be passed to skip-chars-forward and must contain exactly the characters which start the entries in w3-p-d-shortrefs. If this variable is mentioned in the overrides field of an element, its handling is magical in that the variable w3-p-d-non-markup-chars is saved to the element's undo-list and updated at the same time. This should never be in an element's overrides field unless w3-p-d-shortrefs is also in the field.") (defvar w3-p-d-tag-name) (put 'w3-p-d-tag-name 'variable-documentation "Name of tag we are looking at, as an Emacs Lisp symbol. Only non-nil when we are looking at a tag.") (defvar w3-p-d-end-tag-p) (put 'w3-p-d-end-tag-p 'variable-documentation "Is the tag we are looking at an end tag? Only non-nil when we are looking at a tag.") ) ;;; ;;; HTML syntax error messages. ;;; (eval-when-compile (defvar w3-p-d-debug-url) (put 'w3-p-d-debug-url 'variable-documentation "Whether to print the URL being parsed before an error messages. Only true for the first error message.") ;; The level parameter indicates whether the error is (1) very ;; serious, must be displayed to all users, (2) invalid HTML, but the ;; user should only be told if the user has indicated interest, or (3) ;; valid HTML which is bad because it appears to rely on the way certain ;; browsers will display it, which should only be displayed to the user ;; if they have really asked for it. (defmacro w3-debug-html (&rest body) "Emit a warning message. These keywords may be used at the beginning of the arguments: :mandatory-if sexp -- force printing if sexp evaluates non-nil. :bad-style -- do not print unless w3-debug-html is 'style. :outer -- do not include the current element in the element context we report. :nocontext -- do not include context where error detected. The remaining parameters are treated as the body of a progn, the value of which must be a string to use as the error message." (let (mandatory-if bad-style outer nocontext condition) (while (memq (car body) '(:mandatory-if :bad-style :outer :nocontext)) (cond ((eq ':mandatory-if (car body)) (setq mandatory-if (car (cdr body))) (setq body (cdr (cdr body)))) ((eq ':bad-style (car body)) (setq bad-style t) (setq body (cdr body))) ((eq ':nocontext (car body)) (setq nocontext t) (setq body (cdr body))) ((eq ':outer (car body)) (setq outer t) (setq body (cdr body))))) (setq condition (if bad-style '(eq 'style w3-debug-html) 'w3-debug-html)) (if mandatory-if (setq condition (` (or (, mandatory-if) (, condition))))) (` (if (, condition) (let ((message (progn (,@ body)))) (if message (w3-debug-html-aux message (,@ (if nocontext (list outer nocontext) (if outer '(t))))))))))) ;; This is unsatisfactory. (put 'w3-debug-html 'lisp-indent-function 0) (put 'w3-debug-html 'edebug-form-spec '([&rest &or ":nocontext" ":outer" [":mandatory-if" form] ":bad-style"] &rest form)) ) (defun w3-debug-html-aux (message &optional outer nocontext) (let ( ;; We have already determined whether the user should see the ;; message, so don't let w3-warn suppress it. (w3-debug-html t)) ;; Print the URL before the first error message for a document. (cond (w3-p-d-debug-url (let ((url (url-view-url t))) (w3-warn 'html (if (or (null url) (string-equal "" url)) (format "HTML errors for buffer %s" (current-buffer)) (format "HTML errors for <URL:%s>" url)))) (setq w3-p-d-debug-url nil))) (w3-warn 'html (if nocontext message (concat message ;; Display context information for each error ;; message. "\n Containing elements: " (w3-open-elements-string (if outer 1)) (concat "\n Text around error: " (save-restriction (widen) (progn (insert "*ERROR*") (prog1 (w3-quote-for-string (buffer-substring (max (- (point) 27) (point-min)) (min (+ (point) 20) (point-max)))) (delete-char -7)))))))))) (defun w3-quote-for-string (string) (save-excursion (set-buffer (get-buffer-create " w3-quote-whitespace")) (erase-buffer) (insert string) (goto-char (point-min)) (insert "\"") (while (progn (skip-chars-forward "^\"\\\t\n\r") (not (eobp))) (insert "\\" (cdr (assq (following-char) '((?\" . "\"") (?\\ . "\\") (?\t . "t") (?\n . "n") (?\r . "r"))))) (delete-char 1)) (insert "\"") (buffer-string))) ;;; ;;; General entity references and numeric character references. ;;; ;; *** MULE conversion? ;; *** I18N HTML support? (let ((html-entities w3-html-entities)) (while html-entities (put (car (car html-entities)) 'html-entity-expansion (cons 'CDATA (if (integerp (cdr (car html-entities))) (char-to-string (mule-make-iso-character (cdr (car html-entities)))) (cdr (car html-entities))))) (setq html-entities (cdr html-entities)))) ;; These are handled differently than the normal HTML entities because ;; we need to define the entities with 'nil instead of 'CDATA so ;; that they are correctly scanned for new markup. ;; ;; from jbw@cs.bu.edu ;; ;;> Of course, this differs from the specification a bit. The W3C tech ;;> report defines all of these as SYSTEM entities. This potentially means ;;> that they can be used in more contexts. The method I outlined above ;;> means "&smiley;" can only be used in contexts where IMG is a valid ;;> element. I am not sure exactly where it is okay to use a SYSTEM entity. ;;> I think anywhere that data characters are accepted. ;; ;; I find this acceptable, as just what the hell are you supposed to do with ;; &computer; as part of a value of a form input when you display it and/or ;; submit it?! (let ((html-entities w3-graphic-entities) (cur nil)) (while html-entities (setq cur (car html-entities) html-entities (cdr html-entities)) (put (nth 0 cur) 'html-entity-expansion (cons 'nil (format "<img src=\"%s/%s%s\" alt=\"%s\">" w3-icon-directory (nth 1 cur) (if w3-icon-format (concat "." (symbol-name w3-icon-format)) "") (or (nth 3 cur) (nth 2 cur))))))) ;; These are the general entities in HTML 3.0 in terms of which the math ;; shortrefs are defined: ;; ;; <!ENTITY REF1 STARTTAG "SUP"> ;; <!ENTITY REF2 ENDTAG "SUP"> ;; <!ENTITY REF3 STARTTAG "SUB"> ;; <!ENTITY REF4 ENDTAG "SUB"> ;; <!ENTITY REF5 STARTTAG "BOX"> ;; <!ENTITY REF6 ENDTAG "BOX"> ;; ;; We're ignoring them because these names should really be local to the ;; DTD and not visible in the document. They might change at any time in ;; future HTML standards. ;; <!--Entities for language-dependent presentation (BIDI and contextual analysis) --> ;; <!ENTITY zwnj CDATA "‌"-- zero width non-joiner--> ;; <!ENTITY zwj CDATA "‍"-- zero width joiner--> ;; <!ENTITY lrm CDATA "‎"-- left-to-right mark--> ;; <!ENTITY rlm CDATA "‏"-- right-to-left mark--> ;; Entity names are case sensitive! ;; & should only be recognized when followed by letter or # and ;; digit or # and letter. (eval-when-compile (w3-p-s-var-def w3-p-s-entity) (w3-p-s-var-def w3-p-s-pos) (w3-p-s-var-def w3-p-s-num) ;; Destroys free variables: ;; w3-p-s-entity, w3-p-s-pos, w3-p-s-num ;; Depends on case-fold-search being t. (defsubst w3-expand-entity-at-point-maybe () ;; We are looking at a &. ;; Only &A or  or &#A syntax is special. (cond ((and (looking-at "&\\([a-z][-a-z0-9.]*\\)[\ ;\n]?") ; \n should be \r ;; We are looking at a general entity reference, maybe undefined. (setq w3-p-s-entity (get (intern (buffer-substring (match-beginning 1) (match-end 1))) 'html-entity-expansion))) ;; If the reference was undefined, then for SGML, we should really ;; issue a warning and delete the reference. However, the HTML ;; standard (contradicting the SGML standard) says to leave the ;; undefined reference in the text. ;; We are looking at a defined general entity reference. (replace-match "") (cond ((eq 'CDATA (car w3-p-s-entity)) ;; Leave point after expansion so we don't rescan it. (insert (cdr w3-p-s-entity))) ((memq (car w3-p-s-entity) '(nil STARTTAG ENDTAG MS MD)) ;; nil is how I mark ordinary entities. ;; The replacement text gets rescanned for all of these. (setq w3-p-s-pos (point)) (insert (cdr (assq (car w3-p-s-entity) '((nil . "") (STARTTAG . "<") (ENDTAG . "</") (MS . "<![") (MD . "<!")))) (cdr w3-p-s-entity) (cdr (assq (car w3-p-s-entity) '((nil . "") (STARTTAG . ">") (ENDTAG . ">") (MS . "]]>") (MD . ">"))))) (goto-char w3-p-s-pos) ;; *** Strictly speaking, if we parse anything from the ;; replacement text, it must end before the end of the ;; replacement text. ) ((eq 'SDATA (car w3-p-s-entity)) (insert "[Unimplemented SDATA \"%s\"]" (cdr w3-p-s-entity))) ((eq 'PI (car w3-p-s-entity)) ;; We are currently ignoring processing instructions. ;; *** Strictly speaking, we should issue a warning if this ;; occurs in a attribute value. ) (t ;; *** We don't handle external entities yet. (error "[Unimplemented entity: \"%s\"]" w3-p-s-entity)))) ((looking-at "&#[0-9][0-9]*\\([\ ;\n]?\\)") ; \n should be \r ;; We are looking at a numeric character reference. ;; Ensure the number is already terminated by a semicolon or carriage ;; return so we can use "read" to get it as a number quickly. (cond ((= (match-beginning 1) (match-end 1)) ;; This is very uncommon, so we don't have to be quick here but ;; rather correct. (save-excursion (goto-char (match-end 0)) ; same as match-end 1 (insert ?\;)) ;; Set up the match data properly (looking-at "&#[0-9][0-9]*;"))) (forward-char 2) (setq w3-p-s-num (read (current-buffer))) ;; Always leave point after the expansion of a numeric ;; character reference, like it were a CDATA entity. (replace-match "") ;; char-to-string will hopefully do something useful with characters ;; larger than 255. I think in MULE it does. Is this true? ;; Bill wants to call w3-resolve-numeric-entity here, but I think ;; that functionality belongs in char-to-string. ;; The largest valid character in the I18N version of HTML is 65533. ;; ftp://ds.internic.net/internet-drafts/draft-ietf-html-i18n-01.txt ;; wrongo! Apparently, mule doesn't do sane things with char-to-string ;; -wmp 7/9/96 (insert (char-to-string (mule-make-iso-character w3-p-s-num)))) ((looking-at "&#\\(re\\|rs\\|space\\|tab\\)[\ ;\n]?") ; \n should be \r (replace-match (assq (upcase (char-after (+ 3 (point)))) '(;; *** Strictly speaking, record end should be ;; carriage return. (?E . "\n") ; RE ;; *** And record start should be line feed. (?S . "") ; RS (?P . " ") ; SPACE (?A . "\t")))) ; TAB ;; Leave point after the expansion of a character reference, so it ;; doesn't get rescanned. ;; *** Strictly speaking, we should issue a warning for &#foo; if foo ;; is not a function character in the SGML declaration. ) ((eq ?& (following-char)) ;; We are either looking at an undefined reference or a & that does ;; not start a reference (in which case we should not have been called). ;; Skip over the &. (forward-char 1)) (t ;; What is the code doing calling us if we're not looking at a "&"? (error "this should never happen")))) ) ;;; ;;; Syntax table used in markup declarations. ;;; (defvar w3-sgml-md-syntax-table (let ((table (make-syntax-table)) (items '( (0 "." 255) ; clear everything (?\r " ") (?\t " ") (?\n " ") (32 " ") ; space (?< "\(>") (?> "\)<") (?\( "\(\)") (?\) "\)\(") (?\[ "\(\]") (?\] "\)\[") (?\" "\"") (?\' "\"") (?a "w" ?z) (?A "w" ?Z) (?0 "w" ?9) (?. "w") ;; "-" can be a character in a NAME, but it is also used in ;; "--" as both a comment start and end within SGML ;; declarations ("<!" ... ">"). In HTML, it is only used ;; as a NAME character in the parameter entities ;; Content-Type, HTTP-Method, and style-notations and in ;; the attribute name http-equiv and in the notation names ;; dsssl-lite and w3c-style. We would like to be able to ;; train Emacs to skip over these kinds of comments with ;; forward-sexp and backward-sexp. Is there any way to ;; teach Emacs how to do this? It doesn't seem to be the ;; case. (?- "w") ))) (while items (let* ((item (car items)) (char (car item)) (syntax (car (cdr item))) (bound (or (car-safe (cdr-safe (cdr item))) char))) (while (<= char bound) (modify-syntax-entry char syntax table) (setq char (1+ char)))) (setq items (cdr items))) table) "A syntax table for parsing SGML markup declarations.") ;;; ;;; Element information data type. ;;; ;; The element information data type is used in two ways: ;; ;; * To store the DTD, there is one element record for each element in ;; the DTD. ;; ;; * To store information for open elements in the current parse tree. ;; Each such element is initialized by copying the element record ;; from the DTD. This means that values in the fields can not be ;; destructively altered, although of course the fields can be ;; changed. ;; The cells in this vector are: ;; ;; name: the element's name (a generic identifier). ;; ;; end-tag-name: a symbol whose name should be the result of prefixing ;; the generic-identifier with a slash. This is a convenience value for ;; interfacing with the display engine which expects a stream of start ;; and end tags in this format rather than a tree. ;; ;; content-model: a data structure describing what elements or character ;; data we expect to find within this element. This is either a symbol ;; listed here: ;; ;; EMPTY: no content, no end-tag allowed. ;; CDATA: all data characters until "</[a-z]" is seen. ;; XCDATA: special non-SGML-standard mode which includes all data ;; characters until "</foo" is seen where "foo" is the name of this ;; element. (for XMP and LISTING) ;; XXCDATA: special non-SGML-standard mode which includes all data ;; until end-of-entity (end-of-buffer for us). (for PLAINTEXT) ;; RCDATA: all data characters until "</[a-z]" is seen, except that ;; entities are expanded first, although the expansions are not ;; scanned for end-tags. ;; XINHERIT: special non-SGML-standard mode which means to use the ;; content model of the containing element instead. ;; ;; or a vector of this structure: ;; ;; [(INCLUDES INCSPACEP (((TAG ...) . TRANSITION) ...) DEFAULT) ...] ;; ;; where INCLUDES is of the format: ;; ;; (TAG ...) ;; ;; where each TRANSITION is one of these: ;; ;; (ACTION NEW-STATE ERRORP) ;; (ACTION NEW-STATE) ;; (ACTION) ;; ;; where DEFAULT is one of these: ;; ;; nil or TRANSITION ;; ;; where the meaning of the components is: ;; ;; INCLUDES is a list of tags for which the transition (*include *same ;; nil) applies. ;; ;; DEFAULT if non-nil is a transition that should be taken when ;; matching any possibility not explicitly listed in another ;; TRANSITION, except for data characters containing only whitespace. ;; ;; INCSPACEP specifies how to handle data characters which include ;; only whitespace characters. The value is non-nil to indicate ;; (*include *same nil) or nil to indicate (*discard *same nil). ;; ;; TAG is a symbol corresponding to the start-tag we are looking at, ;; or *data when seeing character data that includes at least one ;; non-space character. ;; ;; ACTION is one of: ;; *close: Close this element and try again using content model of ;; enclosing element. (Note that this does not apply to the ;; case of an element being closed by its own end-tag.) ;; *include: Process new element as subelement of this one or ;; include data characters directly. ;; *discard: Discard a start-tag or data characters. ;; *retry: Try again after processing NEW-STATE and ERRORP. ;; ELEMENT: Open ELEMENT (with default attributes), then try again ;; using its content model. ;; ;; NEW-STATE (optional, default *same) is the index of the state to ;; move to after processing the element or one of these: ;; *same: no state change occurs. ;; *next: change the current state + 1. ;; The initial state is 0. NEW-STATE does not matter if ACTION is ;; *close. ;; ;; ERRORP (optional, default nil) if non-nil indicates this transition ;; represents an error. The error message includes this value if it ;; is a string. ;; ;; If no matching transition is found, the default transition is ;; (*discard *same "not allowed here"). ;; ;; overrides: An alist of pairs of the form (VAR REPLACEP . VALUE). ;; When this element is opened, the old value of VAR is saved in the ;; undo-list. If REPLACEP is non-nil, then VAR gets value VALUE, ;; otherwise VAR gets value (append VALUE (symbol-value VAR)). Useful ;; values for VAR are: ;; ;; w3-p-d-exceptions: See doc string. ;; ;; w3-p-d-shortrefs: See doc string. ;; ;; w3-p-d-shortref-chars: See doc string. ;; ;; end-tag-omissible: Whether it is legal to omit the end-tag of this ;; element. If an end-tag is inferred for an element whose end tag is ;; not omissible, an error message is given. ;; ;; state: The current state in the content model. Preset to the initial ;; state of 0. ;; ;; undo-list: an alist of of former values of local variables ;; of w3-parse-buffer to restore upon closing this element. Each ;; item on the list is of the format (VAR . VALUE-TO-RESTORE). ;; ;; attributes: an alist of attributes and values. Each item on ;; this list is of the format (ATTRIBUTE-NAME . VALUE). Each ;; ATTRIBUTE-NAME is a symbol and each attribute value is a ;; string. ;; ;; content: a list of the accumulated content of the element. While the ;; element is open, the list is in order from latest to earliest, ;; otherwise it is in order from earliest to latest. Each member is ;; either a string of data characters or a list of the form (NAME ;; ATTRIBUTES CONTENT), where NAME is the subelement's name, ATTRIBUTES ;; is an alist of the subelement's attribute names (lowercase symbols) ;; and their values (strings), and CONTENT is the subelement's content. (eval-when-compile (defconst w3-element-fields '(name end-tag-name content-model state overrides undo-list content attributes end-tag-omissible deprecated)) (let* ((fields w3-element-fields) (index (1- (length fields)))) (while fields (let* ((field (symbol-name (car fields))) (get-sym (intern (concat "w3-element-" field))) (set-sym (intern (concat "w3-set-element-" field)))) (eval (` (progn (defmacro (, get-sym) (element) (list 'aref element (, index))) (defmacro (, set-sym) (element value) (list 'aset element (, index) value)))))) (setq fields (cdr fields)) (setq index (1- index)))) (defmacro w3-make-element () (list 'make-vector (length w3-element-fields) nil)) ;; *** move this to be with DTD declaration. (defmacro w3-fresh-element-for-tag (tag) (` (copy-sequence (or (get (, tag) 'html-element-info) (error "unimplemented element %s" (w3-sgml-name-to-string (, tag))))))) ;; *** move this to be with DTD declaration. (defmacro w3-known-element-p (tag) (` (get (, tag) 'html-element-info))) (defsubst w3-sgml-name-to-string (sym) (upcase (symbol-name sym))) ) ;;; ;;; Parse tree manipulation. ;;; ;; ;; Find the name of the previous element or a substring of the ;; ;; preceding data characters. ;; (let ((content (w3-element-content (car stack)))) ;; (while content ;; (cond ;; ((and (stringp (car content)) ;; (not (string-match "\\`[ \t\n\r]*\\'" (car content)))) ;; (setq prior-item (car content)) ;; ;; Trim trailing whitespace ;; (if (string-match "\\(.*[^ \t\n\r]\\)[ \t\n\r]*\\'" prior-item) ;; (setq prior-item (substring prior-item 0 (match-end 1)))) ;; (if (> (length prior-item) 8) ;; (setq prior-item (concat "..." (substring prior-item -8)))) ;; (setq prior-item (w3-quote-for-string prior-item)) ;; (setq prior-item (concat "\(after " prior-item "\)")) ;; (setq content nil)) ;; ((and (consp (car content)) ;; (symbolp (car (car content)))) ;; (setq prior-item ;; (concat "\(after " ;; (w3-sgml-name-to-string (car (car content))) ;; "\)")) ;; (setq content nil)) ;; (t ;; (setq content (cdr content)))))) ;; Only used for HTML debugging. (defun w3-open-elements-string (&optional skip-count) (let* ((stack (nthcdr (or skip-count 0) (cons w3-p-d-current-element w3-p-d-open-element-stack))) ;;(prior-item "(at start)") result) ;; Accumulate the names of the enclosing elements. (while stack (let ((element (w3-element-name (car stack)))) (if (eq '*holder element) nil ;; Only include *DOCUMENT if there are no other elements. (if (or (not (eq '*document element)) (null result)) (setq result (cons (w3-sgml-name-to-string element) result))))) (setq stack (cdr stack))) (setq result (mapconcat 'identity result ":")) (if result ;;(concat result ;; prior-item) "[nowhere!]"))) ;; *** This doesn't really belong here, but where? (eval-when-compile (defmacro w3-invalid-sgml-chars () "Characters not allowed in an SGML document using the reference concrete syntax (i.e. HTML). Returns a string in the format expected by skip-chars-forward." "\000-\010\013\014\016-\037\177-\237")) (eval-when-compile ;; Uses: ;; w3-p-d-null-end-tag-enabled, w3-p-d-in-parsed-marked-section, ;; w3-p-d-shortref-chars ;; Modifies free variable: ;; w3-p-d-non-markup-chars (defsubst w3-update-non-markup-chars () (setq w3-p-d-non-markup-chars (concat "^&<" (w3-invalid-sgml-chars) (if w3-p-d-null-end-tag-enabled "/" "") (if w3-p-d-in-parsed-marked-section "]" "") (or w3-p-d-shortref-chars "")))) ) (eval-when-compile (w3-p-s-var-def w3-p-s-overrides) (w3-p-s-var-def w3-p-s-undo-list) (w3-p-s-var-def w3-p-s-var) ;; Uses free variables: ;; w3-p-d-non-markup-chars ;; Modifies free variables: ;; w3-p-d-current-element, w3-p-d-open-element-stack ;; Destroys free variables: ;; w3-p-s-overrides, w3-p-s-undo-list, w3-p-s-var (defsubst w3-open-element (tag attributes) ;; Push new element on stack. (setq w3-p-d-open-element-stack (cons w3-p-d-current-element w3-p-d-open-element-stack)) (setq w3-p-d-current-element (w3-fresh-element-for-tag tag)) ;; Warn if deprecated or obsolete. (if (w3-element-deprecated w3-p-d-current-element) (w3-debug-html :outer (format "%s element %s." (if (eq 'obsolete (w3-element-deprecated w3-p-d-current-element)) "Obsolete" "Deprecated") (w3-sgml-name-to-string (w3-element-name w3-p-d-current-element))))) ;; Store attributes. ;; *** we are not handling #CURRENT attributes (HTML has none). (w3-set-element-attributes w3-p-d-current-element attributes) ;; *** Handle default attribute values. ;; *** Fix the attribute name for unnamed values. Right now they will ;; be in the attribute list as items of the format (VALUE . VALUE) where ;; both occurrences of VALUE are the same. The first one needs to be ;; changed to the proper attribute name by consulting the DTD. ;; ******************** ;; Handle syntax/semantics overrides of new current element. (cond ((w3-element-overrides w3-p-d-current-element) (setq w3-p-s-overrides (w3-element-overrides w3-p-d-current-element)) (setq w3-p-s-undo-list nil) (while w3-p-s-overrides (setq w3-p-s-var (car (car w3-p-s-overrides))) (setq w3-p-s-undo-list (cons (cons w3-p-s-var (symbol-value w3-p-s-var)) w3-p-s-undo-list)) (set w3-p-s-var (if (car (cdr (car w3-p-s-overrides))) (cdr (cdr (car w3-p-s-overrides))) (append (cdr (cdr (car w3-p-s-overrides))) (symbol-value w3-p-s-var)))) ;; *** HACK HACK. ;; Magic handling of w3-p-d-shortref-chars. (cond ((eq 'w3-p-d-shortref-chars w3-p-s-var) (setq w3-p-s-undo-list (cons (cons 'w3-p-d-non-markup-chars w3-p-d-non-markup-chars) w3-p-s-undo-list)) (w3-update-non-markup-chars))) (setq w3-p-s-overrides (cdr w3-p-s-overrides))) (w3-set-element-undo-list w3-p-d-current-element w3-p-s-undo-list))) ;; Handle content-model inheritance. (Very non-SGML!) (if (eq 'XINHERIT (w3-element-content-model w3-p-d-current-element)) (w3-set-element-content-model w3-p-d-current-element (w3-element-content-model (car w3-p-d-open-element-stack)))) ) ) ;; The protocol for handing items to the display engine is as follows. ;; ;; For an element, send (START-TAG . ATTS), each member of the content, ;; and (END-TAG . nil) if the element is allowed to have an end tag. ;; ;; For data characters, send (text . DATA-CHARACTERS). ;; ;; Exceptions: ;; ;; For PLAINTEXT, STYLE, XMP, TEXTAREA send: ;; (START-TAG . ((data . DATA-CHARACTERS) . ATTS)). ;; ;; *** This requires somehow eliminating any subelements of the TEXTAREA ;; element. TEXTAREA can contain subelements in HTML 3.0. ;; ;; For LISTING, send (text . DATA-CHARACTERS). (Is this really correct or ;; is this perhaps a bug in the old parser?) I'm ignoring this for now. (eval-when-compile (w3-p-s-var-def w3-p-s-undo-list) (w3-p-s-var-def w3-p-s-content) (w3-p-s-var-def w3-p-s-end-tag) ;; Modifies free variables: ;; w3-p-d-current-element, w3-p-d-open-element-stack ;; Accesses free variables: ;; w3-p-d-tag-name, w3-p-d-end-tag-p ;; Destroys free variables: ;; w3-p-s-undo-list, w3-p-s-content, w3-p-s-end-tag (defsubst w3-close-element (&optional inferred) ;; inferred: non-nil if the end-tag of the current element is being ;; inferred due to the presence of content not allowed in the current ;; element. If t, then the tag causing this is in w3-p-d-tag-name and ;; w3-p-d-end-tag-p. ;; (OLD: ... otherwise it is a symbol indicating the start-tag ;; of an element or *data or *space indicating data characters.) (cond ((and inferred (not (w3-element-end-tag-omissible w3-p-d-current-element))) (w3-debug-html (format "</%s> end-tag not omissible (required due to %s)" (w3-sgml-name-to-string (w3-element-name w3-p-d-current-element)) (cond ((eq t inferred) (format (if w3-p-d-end-tag-p "</%s> end-tag" "start-tag for %s") (w3-sgml-name-to-string w3-p-d-tag-name))) ;; *** Delete this functionality? ((memq inferred '(*space *data)) "data characters") ((symbolp inferred) (format "start-tag for %s" (w3-sgml-name-to-string inferred))) ))))) ;; Undo any variable bindings of this element. (cond ((w3-element-undo-list w3-p-d-current-element) (setq w3-p-s-undo-list (w3-element-undo-list w3-p-d-current-element)) (while w3-p-s-undo-list (set (car (car w3-p-s-undo-list)) (cdr (car w3-p-s-undo-list))) (setq w3-p-s-undo-list (cdr w3-p-s-undo-list))))) (setq w3-p-s-end-tag (w3-element-end-tag-name w3-p-d-current-element)) ;; Fix up the content of the current element in preparation for putting ;; it in the parent. ;; Remove trailing newline from content, if there is one, otherwise send ;; any trailing data character item to display engine. (setq w3-p-s-content (w3-element-content w3-p-d-current-element)) (cond ((null w3-p-s-content)) ((equal "\n" (car w3-p-s-content)) (setq w3-p-s-content (cdr w3-p-s-content))) ) (cond ;; *** Handle LISTING the way the old parser did. ((eq 'EMPTY (w3-element-content-model w3-p-d-current-element)) ;; Do nothing, can't have an end tag. ) (t ;; Normal case. (if (null w3-p-s-content) (w3-debug-html :bad-style :outer ;; Don't warn for empty TD elements or empty A elements ;; with no HREF attribute. ;; *** Crude hack that should really be encoded in the ;; element database somehow. (if (or (not (memq (w3-element-name w3-p-d-current-element) '(a td))) (assq 'href (w3-element-attributes w3-p-d-current-element))) (format "Empty %s element." (w3-sgml-name-to-string (w3-element-name w3-p-d-current-element)))))))) ;; Put the current element in the proper place in its parent. ;; This will cause an error if we overpop the stack. (w3-set-element-content (car w3-p-d-open-element-stack) (cons (list (w3-element-name w3-p-d-current-element) (w3-element-attributes w3-p-d-current-element) (nreverse w3-p-s-content)) (w3-element-content (car w3-p-d-open-element-stack)))) ;; Pop the stack. (setq w3-p-d-current-element (car w3-p-d-open-element-stack)) (setq w3-p-d-open-element-stack (cdr w3-p-d-open-element-stack))) ) ;;; ;;; A pseudo-DTD for HTML. ;;; (eval-when-compile ;; This works around the following bogus compiler complaint: ;; While compiling the end of the data in file w3-parse.el: ;; ** the function w3-expand-parameters is not known to be defined. ;; This is a bogus error. Anything of this form will trigger this message: ;; (eval-when-compile (defun xyzzy () (xyzzy))) (defun w3-expand-parameters (pars data) nil)) (eval-when-compile (defun w3-expand-parameters (pars data) (cond ((null data) nil) ((consp data) ;; This has to be written carefully to avoid exceeding the ;; maximum lisp function call nesting depth. (let (result) (while (consp data) (let ((car-exp (w3-expand-parameters pars (car data)))) (setq result (if (and (symbolp (car data)) (not (eq car-exp (car data))) ;; An expansion occurred. (listp car-exp)) ;; The expansion was a list, which we splice in. (condition-case err (append (reverse car-exp) result) (wrong-type-argument (if (eq 'listp (nth 1 err)) ;; Wasn't really a "list" since the last ;; cdr wasn't nil, so don't try to splice ;; it in. (cons car-exp result) (signal (car err) (cdr err))))) (cons car-exp result)))) (setq data (cdr data))) (append (nreverse result) (w3-expand-parameters pars data)))) ((symbolp data) (let ((sym-exp (cdr-safe (assq data pars)))) (if sym-exp (w3-expand-parameters pars sym-exp) data))) ((vectorp data) (let ((i 0) (result (copy-sequence data))) (while (< i (length data)) (aset result i (w3-expand-parameters pars (aref data i))) (setq i (1+ i))) result)) (t data)))) (eval-when-compile (defun w3-unfold-dtd (items) (let (result) (while items (let* ((item (car items)) (names (car item)) (content-model (or (cdr-safe (assq 'content-model item)) (error "impossible"))) (overrides (cdr-safe (assq 'overrides item))) (end-tag-omissible (or (cdr-safe (assq 'end-tag-omissible item)) ;; *** Is this SGML standard? (eq 'EMPTY content-model))) (deprecated (cdr-safe (assq 'deprecated item))) element name) (while names (setq name (car names)) (setq names (cdr names)) ;; Create and initialize the element information data ;; structure. (setq element (w3-make-element)) (w3-set-element-name element name) (w3-set-element-end-tag-name element (intern (concat "/" (symbol-name name)))) (w3-set-element-state element 0) (w3-set-element-content-model element content-model) (w3-set-element-end-tag-omissible element end-tag-omissible) (or (memq deprecated '(nil t obsolete)) (error "impossible")) (w3-set-element-deprecated element deprecated) ;; Inclusions and exclusions are specified differently in the ;; human-coded DTD than in the format the implementation uses. ;; The human-coded version is designed to be easy to edit and to ;; work with w3-expand-parameters while the internal version is ;; designed to be fast. We have to translate here. This work ;; is repeated for every element listed in `names' so that the ;; exclusion exception error messages can be accurate. (let ((inclusions (cdr-safe (assq 'inclusions item))) (exclusions (cdr-safe (assq 'exclusions item))) (exclusion-mode '*close) (exclusion-message (format "%s exclusion" (w3-sgml-name-to-string name))) exceptions) (while inclusions (setq exceptions (cons (cons (car inclusions) '(*include *same nil)) exceptions)) (setq inclusions (cdr inclusions))) (while exclusions (cond ((memq (car exclusions) '(*discard *include *close)) (setq exclusion-mode (car exclusions))) ((stringp (car exclusions)) (setq exclusion-message (car exclusions))) (t (setq exceptions (cons (list (car exclusions) exclusion-mode '*same exclusion-message) exceptions)))) (setq exclusions (cdr exclusions))) (let ((overrides (if exceptions (cons (cons 'w3-p-d-exceptions (cons nil exceptions)) overrides) overrides))) (w3-set-element-overrides element overrides))) (setq result (cons (cons name element) result)))) (setq items (cdr items))) result))) ;; Load the HTML DTD. ;; <URL:ftp://ds.internic.net/rfc/rfc1866.txt> ;; *** Be sure to incorporate rfc1867 when attribute-checking is added. ;; *** Write function to check sanity of the content-model forms. ;; *** I18N: Add Q, BDO, SPAN (mapcar (function (lambda (pair) (put (car pair) 'html-element-info (cdr pair)))) ;; The purpose of this complexity is to speed up loading by ;; pre-evaluating as much as possible at compile time. (eval-when-compile (w3-unfold-dtd (w3-expand-parameters '( (%headempty . (link base meta range)) (%headmisc . (script)) (%head-deprecated . (nextid)) ;; client-side imagemaps (%imagemaps . (area map)) ;; special action is taken for %text inside %body.content in the ;; content model of each element. (%body.content . (%heading %block style hr div address %imagemaps)) (%heading . (h1 h2 h3 h4 h5 h6)) ;; Emacs-w3 extensions (%emacsw3-crud . (pinhead flame cookie yogsothoth hype peek)) (%block . (p %list dl form %preformatted %blockquote isindex fn table fig note center %block-deprecated %block-obsoleted)) (%list . (ul ol)) (%preformatted . (pre)) (%blockquote . (bq)) (%block-deprecated . (dir menu blockquote)) (%block-obsoleted . (xmp listing)) ;; Why is IMG in this list? (%pre.exclusion . (*include img *discard tab math big small sub sup)) (%text . (*data b %notmath sub sup %emacsw3-crud)) (%notmath . (%special %font %phrase %misc)) (%font . (i u s strike tt big small sub sup font roach secret wired)) ;; B left out for MATH (%phrase . (em strong dfn code samp kbd var cite blink)) (%special . (a img applet object font basefont br script style map math tab span bdo)) (%misc . (q lang au person acronym abbrev ins del)) (%formula . (*data %math)) (%math . (box above below %mathvec root sqrt array sub sup %mathface)) (%mathvec . (vec bar dot ddot hat tilde)) (%mathface . (b t bt)) (%mathdelims . (over atop choose left right of)) ;; What the hell? This takes BODYTEXT????? No way! (%bq-content-model . [(nil nil (((bodytext) *include *next)) (bodytext *next)) (nil nil (((credit) *include *next)) nil) (nil nil nil nil) ]) ;; non-default bad HTML handling. (%in-text-ignore . ((p %heading) *discard *same error)) ) '( ;; A dummy element that will contain *document. ((*holder) (content-model . [(nil nil nil nil)])) ;; The root of the parse tree. We start with a pseudo-element ;; named *document for convenience. ((*document) (content-model . [(nil nil (((html) *include *next)) (html *next)) (nil nil nil (*include *same "after document end"))]) (end-tag-omissible . t)) ;; HTML O O (HEAD, BODY) ((html) (content-model . [(nil nil (((head) *include *next)) (head *next)) (nil nil (((body) *include *next) ;; Netscape stuff ((frameset) *include 4) ) (body *next)) (nil nil (((plaintext) *include *next)) (*retry *next)) (nil nil nil (*include *same "after BODY")) (nil nil nil (*include *same "after FRAMESET")) ]) (end-tag-omissible . t)) ((head) (content-model . [((title isindex %headempty %headmisc style %head-deprecated) nil nil ;; *** Should only close if tag can ;; legitimately follow head. So many can that ;; I haven't bothered to enumerate them. (*close))]) (end-tag-omissible . t)) ;; SCRIPT - - (#PCDATA) ((script) (content-model . XCDATA ; not official, but allows ; comment hiding of script, and also ; idiots that use '</' in scripts. )) ;; TITLE - - (#PCDATA) ((title) (content-model . RCDATA ; not official ;; [((*data) include-space nil nil)] )) ;; STYLE - O (#PCDATA) ;; STYLE needs to be #PCDATA to allow omitted end tag. Bleagh. ((style) (content-model . CDATA) (end-tag-omissible . t)) ((body) (content-model . [((banner) nil nil (*retry *next)) ((bodytext) nil nil (bodytext *next)) (nil nil (((plaintext) *close)) nil)]) (inclusions . (spot)) (end-tag-omissible . t)) ;; Do I really want to include BODYTEXT? It has something to do ;; with mixed content screwing things up, and I don't understand ;; it. Wait! It's used by BQ! ((bodytext) (content-model . [((%body.content) nil ;; Push <P> before data characters. Non-SGML. (((%text) p) ;; Closing when seeing CREDIT is a stupidity ;; caused by BQ's sharing of BODYTEXT. BQ ;; should have its own BQTEXT. ((credit plaintext) *close)) nil)]) (end-tag-omissible . t)) ((div banner center) (content-model . [((%body.content) nil ;; Push <P> before data characters. Non-SGML. (((%text) p)) nil)])) ((address) (content-model . [((p) nil ;; Push <P> before data characters. Non-SGML. (((%text) p)) nil)])) ((%heading) (content-model . [((%text) include-space ((%in-text-ignore)) nil)])) ((span bdo) (content-model . [((%text) include-space nil nil)]) ) ((p) (content-model . [((%text) include-space nil ;; *** Should only close if tag can ;; legitimately follow P. So many can that I ;; don't bother to enumerate here. (*close))]) (end-tag-omissible . t)) ((ul ol) (content-model . [((lh) nil (((li) *include *next)) (*retry *next)) ((li) nil ;; Push <LI> before data characters or block ;; elements. ;; Non-SGML. (((%text %block) li *same error)) nil)])) ((lh) (content-model . [((%text) include-space (((dd dt li) *close) (%in-text-ignore)) nil)]) (end-tag-omissible . t)) ((dir menu) (content-model . [((li) nil (((%text) li *same error)) nil)]) (exclusions . (%block)) (deprecated . t)) ((li) (content-model . [((%block) nil (((li) *close) ;; Push <P> before data characters. Non-SGML. ((%text) p)) nil)]) (end-tag-omissible . t) ;; Better bad HTML handling. ;; Technically, there are a few valid documents that this will ;; hose, because you can have H1 inside FORM inside LI. However, ;; I don't think that should be allowed anyway. (exclusions . (*discard "not allowed here" %heading))) ((dl) (content-model . [((lh) nil (((dt dd) *include *next)) (*retry *next)) ((dt dd) nil ;; Push <DD> before data characters or block ;; items. ;; Non-SGML. (((%text %block) dd *same error)) nil)])) ((dt) (content-model . [((%text) include-space (((dd dt) *close) (%in-text-ignore)) nil)]) (end-tag-omissible . t)) ;; DD is just like LI, but we treat it separately because it can be ;; followed by a different set of elements. ((dd) (content-model . [((%block) nil (((dt dd) *close) ;; Push <P> before data characters. Non-SGML. ((%text) p)) nil)]) (end-tag-omissible . t) ;; See comment with LI. (exclusions . (*discard "not allowed here" %heading))) ((pre) (content-model . [((%text hr) include-space ((%in-text-ignore)) nil)]) (exclusions . (%pre.exclusion))) ;; BLOCKQUOTE deprecated, BQ okay ((bq) (content-model . %bq-content-model)) ((blockquote) (content-model . %bq-content-model) ;; BLOCKQUOTE is deprecated in favor of BQ in the HTML 3.0 DTD. ;; However, BQ is not even mentioned in the HTML 2.0 DTD. So I ;; don't think we can enable this yet: ;;(deprecated . t) ) ((fn note) (content-model . [((%body.content) nil ;; Push <P> before data characters. Non-SGML. (((%text) p)) nil)])) ((fig) (content-model . [((overlay) nil nil (*retry *next)) (nil nil (((caption) *include *next)) (*retry *next)) (nil nil (((figtext) *include *next) ((credit) *retry *next)) ;; *** Should only do this for elements that ;; can be in FIGTEXT. (figtext *next)) (nil nil (((credit) *include *next)) nil) (nil nil nil nil)])) ((caption credit) (content-model . [((%text) nil ((%in-text-ignore)) nil)])) ((figtext) (content-model . [((%body.content) nil ;; Push <P> before data characters. Very non-SGML. (((%text) p) ((credit) *close)) nil)]) (end-tag-omissible . t)) ((%emacsw3-crud basefont) (content-model . EMPTY)) ;; FORM - - %body.content -(FORM) +(INPUT|KEYGEN|SELECT|TEXTAREA) ((form) ;; Same as BODY. Ugh! (content-model . [((%body.content) nil ;; Push <P> before data characters. Non-SGML. (((%text) p)) nil)]) (exclusions . (form)) (inclusions . (input select textarea keygen label))) ;; *** Where is the URL describing this? ((label) (content-model . [((%text) include-space nil nil)]) ;; *** These are already included, no need to repeat. ;;(inclusions . (input select textarea)) ;; *** Is a LABEL allowed inside a LABEL? I assume no. (exclusions . (label)) ;; The next line just does the default so is unneeded: ;;(end-tag-omissible . nil) ) ;; SELECT - - (OPTION+) -(INPUT|KEYGEN|TEXTAREA|SELECT)> ;; *** This should be -(everything). ((select) (content-model . [((option) nil nil nil)]) (exclusions . (input label select keygen textarea))) ;; option - O (#PCDATA) ;; needs to be #PCDATA to allow omitted end tag. ((option) ;; I'd like to make this RCDATA to avoid problems with inclusions ;; like SPOT, but that would conflict with the omitted end-tag, I ;; think. (content-model . [((*data) include-space (((option) *close)) nil)]) (end-tag-omissible . t)) ;; TEXTAREA - - (#PCDATA) -(INPUT|TEXTAREA|KEYGEN|SELECT) ((textarea) ;; Same comment as for OPTION about RCDATA. (content-model . [((*data) include-space nil nil)]) (exclusions . (input select label keygen textarea))) ((hr br img isindex input keygen overlay wbr spot tab %headempty %mathdelims) (content-model . EMPTY)) ((nextid) (content-model . EMPTY) (deprecated . t)) ((a) (content-model . [((%text) include-space (((%heading) *include *same "deprecated inside A") ;; *** I haven't made up my mind whether this ;; is a good idea. It can result in a lot of ;; bad formatting if the A is *never* closed. ;;((p) *discard *same error) ) nil)]) (exclusions . (a))) ((b font %font %phrase %misc nobr) (content-model . [((%text) include-space ((%in-text-ignore)) nil)])) ((plaintext) (content-model . XXCDATA) (end-tag-omissible . t) (deprecated . obsolete)) ((xmp listing) (content-model . XCDATA) (deprecated . obsolete)) ;; Latest table spec (as of Nov. 13 1995) is at: ;; <URL:ftp://ds.internic.net/internet-drafts/draft-ietf-html-tables-03.txt> ((table) (content-model . [(nil nil (((caption) *include *next) ((col colgroup thead tfoot tbody tr) *retry *next)) (*retry *next)) ;error handling ((col colgroup) nil (((thead tfoot tbody tr) *retry *next)) (*retry *next)) ;error handling (nil nil (((thead) *include *next) ((tfoot tbody tr) *retry *next)) (*retry *next)) ;error handling (nil nil (((tfoot) *include *next) ((tbody tr) *retry *next)) (*retry *next)) ;error handling ((tbody) nil (((tr) tbody *same) ;; error handling ((%body.content) tbody *same error)) nil)])) ((colgroup) (content-model . [((col) nil (((colgroup thead tfoot tbody tr) *close)) nil)]) (end-tag-omissible . t)) ((col) (content-model . EMPTY)) ((thead) (content-model . [((tr) nil (((tfoot tbody) *close) ;; error handling ((%body.content) tr *same error)) nil)]) (end-tag-omissible . t)) ((tfoot tbody) (content-model . [((tr) nil (((tbody) *close) ;; error handling ((%body.content) tr *same error)) nil)]) (end-tag-omissible . t)) ((tr) (content-model . [((td th) nil (((tr tfoot tbody) *close) ;; error handling ((%body.content) td *same error)) nil)]) (end-tag-omissible . t)) ((td th) ;; Arrgh! Another %body.content!!! Stupid!!! (content-model . [((%body.content) nil (((td th tr tfoot tbody) *close) ;; Push <P> before data characters. Non-SGML. ((%text) p)) nil)]) (end-tag-omissible . t)) ((math) (content-model . [((*data) include-space nil nil)]) (overrides . ((w3-p-d-shortref-chars t . "\{_^") (w3-p-d-shortrefs t . (("\\^" . "<sup>") ("_" . "<sub>") ("{" . "<box>"))))) (inclusions . (%math)) (exclusions . (%notmath))) ((sup) (content-model . [((%text) include-space ((%in-text-ignore)) nil)]) (overrides . ((w3-p-d-shortref-chars t . "\{_^") (w3-p-d-shortrefs t . (("\\^" . "</sup>") ("_" . "<sub>") ("{" . "<box>")))))) ((sub) (content-model . [((%text) include-space ((%in-text-ignore)) nil)]) (overrides . ((w3-p-d-shortref-chars t . "\{_^") (w3-p-d-shortrefs t . (("\\^" . "<sup>") ("_" . "</sub>") ("{" . "<box>")))))) ((box) (content-model . [((%formula) include-space (((left) *include 1) ((over atop choose) *include 2) ((right) *include 3)) nil) ((%formula) include-space (((over atop choose) *include 2) ((right) *include 3)) nil) ((%formula) include-space (((right) *include 3)) nil) ((%formula) include-space nil nil)]) (overrides . ((w3-p-d-shortref-chars t . "{}_^") (w3-p-d-shortrefs t . (("\\^" . "<sup>") ("_" . "<sub>") ("{" . "<box>") ("}" . "</box>")))))) ((above below %mathvec t bt sqrt) (content-model . [((%formula) include-space nil nil)])) ;; ROOT has a badly-specified content-model in HTML 3.0. ((root) (content-model . [((%formula) include-space (((of) *include *next)) nil) ((%formula) include-space nil nil)])) ((of) (content-model . [((%formula) include-space nil nil)]) ;; There is no valid way to infer a missing end-tag for OF. This ;; is bizarre. (end-tag-omissible . t)) ((array) (content-model . [((row) nil nil nil)])) ((row) (content-model . [((item) nil (((row) *close)) nil)]) (end-tag-omissible . t)) ((item) (content-model . [((%formula) include-space (((row item) *close)) nil)]) (end-tag-omissible . t)) ;; The old parser would look for the </EMBED> end-tag and include ;; the contents between <EMBED> and </EMBED> as the DATA attribute ;; of the EMBED start-tag. However, it did not require the ;; </EMBED> end-tag and did nothing if it was missing. This is ;; completely impossible to specify in SGML. ;; ;; See ;; <URL:http://www.eit.com/goodies/lists/www.lists/www-html.1995q3/0603.html> ;; ;; Questions: Does EMBED require the end-tag? How does NOEMBED fit ;; into this? Where can EMBED appear? ;; ;; Nov. 25 1995: a new spec for EMBED (also an I-D): ;; <URL:http://www.cs.princeton.edu/~burchard/www/interactive/> ;; ;; Here is my guess how to code EMBED: ((embed) (content-model . [((noembed) nil nil (*close))])) ((noembed) (content-model . [((%body.content) ; hack hack hack nil (((%text) p)) nil)])) ;; ;; FRAMESET is a Netscape thing. ;; <URL:http://www.eit.com/goodies/lists/www.lists/www-html.1995q3/0588.html> ((frameset) (content-model . [((noframes frame frameset) nil nil nil)])) ((noframes) (content-model . [((%body.content) nil ;; Push <P> before data characters. Non-SGML. (((%text) p)) nil)])) ((frame) (content-model . EMPTY)) ;; ;; APPLET is a Java thing. ;; OBJECT is a cougar thing ;; <URL:http://java.sun.com/JDK-beta/filesinkit/README> ((applet object) ;; I really don't want to add another ANY content-model. (content-model . XINHERIT) (inclusions . (param))) ((param) (content-model . EMPTY)) ;; backward compatibility with old Java. ((app) (content-model . EMPTY)) ;; Client-side image maps. ;; <URL:ftp://ds.internic.net/internet-drafts/draft-seidman-clientsideimagemap-01.txt> ;; *** The only problem is that I don't know in what elements MAP ;; can appear, so none of this is reachable yet. ((map) (content-model . [((area) nil nil nil)])) ((area) (content-model . EMPTY)) ))))) ;;; ;;; Omitted tag inference using state transition tables. ;;; (eval-when-compile (w3-p-s-var-def w3-p-s-includep) (w3-p-s-var-def w3-p-s-state-transitions) (w3-p-s-var-def w3-p-s-transition) (w3-p-s-var-def w3-p-s-tran-list) (w3-p-s-var-def w3-p-s-content-model) (w3-p-s-var-def w3-p-s-except) (w3-p-s-var-def w3-p-s-baseobject) (w3-p-s-var-def w3-p-s-btdt) ;; Uses free variables: ;; w3-p-d-current-element, w3-p-d-exceptions ;; Destroys free variables: ;; w3-p-s-includep, w3-p-s-state-transitions, w3-p-s-transition, ;; w3-p-s-tran-list, w3-p-s-content-model, w3-p-s-except ;; Returns t if the element or data characters should be included. ;; Returns nil if the element or data characters should be discarded. (defsubst w3-grok-tag-or-data (tag-name) (while (cond ((symbolp (setq w3-p-s-content-model (w3-element-content-model w3-p-d-current-element))) (or (and (memq w3-p-s-content-model '(CDATA RCDATA XCDATA XXCDATA)) (memq tag-name '(*data *space))) ;; *** Implement ANY. (error "impossible content model lossage")) (setq w3-p-s-includep t) ;; Exit loop. nil) (t ;; We have a complex content model. ;; Cache some data from the element info structure. Format is: ;; (INCLUDES INCSPACEP (((TAG ...) . TRANSITION) ...) DEFAULT) (setq w3-p-s-state-transitions (aref w3-p-s-content-model (w3-element-state w3-p-d-current-element))) ;; Optimize the common cases. (cond ((eq '*space tag-name) ;; Optimizing the (*space *discard *same nil) transition. (setq w3-p-s-includep (car (cdr w3-p-s-state-transitions))) ;; Don't loop. nil) ((and (not (setq w3-p-s-except (assq tag-name w3-p-d-exceptions))) (memq tag-name (car w3-p-s-state-transitions))) ;; Equivalent to a transition of (TAG *include *same nil). ;; So we are done, return t to caller. (setq w3-p-s-includep t) ;; Exit loop. nil) (t ;; The general case. (cond ;; Handle inclusions and exclusions. (w3-p-s-except (setq w3-p-s-transition (cdr w3-p-s-except))) ;; See if the transition is in the complex transitions ;; component. ((progn (setq w3-p-s-tran-list (car (cdr (cdr w3-p-s-state-transitions)))) (setq w3-p-s-transition nil) (while w3-p-s-tran-list (cond ((memq tag-name (car (car w3-p-s-tran-list))) ;; We've found a transition. (setq w3-p-s-transition (cdr (car w3-p-s-tran-list))) (setq w3-p-s-tran-list nil)) (t (setq w3-p-s-tran-list (cdr w3-p-s-tran-list))))) ;; Check if we found it. w3-p-s-transition) ;; body of cond clause empty ) ;; Try finding the transition in the DEFAULT component of the ;; transition table, but avoid doing this for unknown elements, ;; always use the default-default for them. ((and (or (eq '*data tag-name) (w3-known-element-p tag-name)) (setq w3-p-s-transition (nth 3 w3-p-s-state-transitions))) ;; body of cond clause empty ) (t ;; Supply a default-default transition. (if (not (or (eq '*data tag-name) (w3-known-element-p tag-name))) (setq w3-p-s-transition '(*discard *same "unknown element")) ;; Decide whether to *close or *discard ;; based on whether this element would be ;; accepted as valid in an open ancestor. (let ((open-list w3-p-d-open-element-stack) (all-end-tags-omissible (w3-element-end-tag-omissible w3-p-d-current-element)) state-transitions tran-list) (if (catch 'found (while open-list (setq state-transitions (aref (w3-element-content-model (car open-list)) (w3-element-state (car open-list)))) (if (memq tag-name (car state-transitions)) (throw 'found t)) (setq tran-list (nth 2 state-transitions)) (while tran-list (cond ((memq tag-name (car (car tran-list))) (if (not (nth 3 (car tran-list))) ;; Not an error transition. (throw 'found t)) (setq tran-list nil)) (t (setq tran-list (cdr tran-list))))) ;; The input item is not accepted in this ;; ancestor. Try again in next ancestor. (or (w3-element-end-tag-omissible (car open-list)) (setq all-end-tags-omissible nil)) (setq open-list (cdr open-list))) nil) (setq w3-p-s-transition (if (w3-element-end-tag-omissible w3-p-d-current-element) (if all-end-tags-omissible ;; Probably indicates a need to debug ;; the DTD state-transition tables. '(*close *same "missing transition in DTD?") ;; Error will be reported later. '(*close *same)) '(*close *same "not allowed here"))) (setq w3-p-s-transition '(*discard *same "not allowed here"))))))) ;; We have found a transition to take. The transition is of ;; the format (ACTION NEW-STATE ERRORP) where the latter two ;; items are optional. ;; First, handle any state-change. (or (memq (car-safe (cdr w3-p-s-transition)) '(nil *same)) (w3-set-element-state w3-p-d-current-element (if (eq '*next (car-safe (cdr w3-p-s-transition))) (1+ (w3-element-state w3-p-d-current-element)) (car-safe (cdr w3-p-s-transition))))) ;; Handle any error message. (if (car-safe (cdr-safe (cdr w3-p-s-transition))) (w3-debug-html :mandatory-if (and (eq '*data tag-name) (eq '*discard (car w3-p-s-transition))) (format "Bad %s [%s], %s" (if (eq '*data tag-name) "data characters" (concat "start-tag " (w3-sgml-name-to-string tag-name))) (if (stringp (car (cdr (cdr w3-p-s-transition)))) (car (cdr (cdr w3-p-s-transition))) "not allowed here") (let ((action (car w3-p-s-transition))) (cond ((eq '*discard action) "discarding bad item") ((eq '*close action) (concat "inferring </" (w3-sgml-name-to-string (w3-element-name w3-p-d-current-element)) ">")) ((eq '*include action) "including bad item anyway") ((eq '*retry action) "*retry ??? you shouldn't see this") (t (concat "inferring <" (w3-sgml-name-to-string action) ">"))))))) ;; Handle the action. (cond ((eq '*include (car w3-p-s-transition)) (setq w3-p-s-includep t) ;; Exit loop. nil) ((eq '*close (car w3-p-s-transition)) ;; Perform end-tag inference. (w3-close-element) ; don't pass parameter ;; Loop and try again in parent element's content-model. t) ((eq '*discard (car w3-p-s-transition)) (setq w3-p-s-includep nil) ;; Exit loop. nil) ((eq '*retry (car w3-p-s-transition)) ;; Loop and try again after state change. t) ((symbolp (car w3-p-s-transition)) ;; We need to open another element to contain the text, ;; probably a <P> (look in the state table). (w3-open-element (car w3-p-s-transition) nil) ;; Now we loop and try again in the new element's ;; content-model. t) (t (error "impossible transition"))))))) ;; Empty while loop body. ) ;; Return value to user indicating whether to include or discard item: ;; t ==> include ;; nil ==> discard w3-p-s-includep) ) ;;; ;;; Main parser. ;;; (defvar w3-last-parse-tree nil "Used for debugging only. Stores the most recently computed parse tree \(a tree, not a parse tag stream\).") (defun w3-display-parse-tree (&optional ptree) (interactive) (with-output-to-temp-buffer "W3 HTML Parse Tree" (set-buffer standard-output) (emacs-lisp-mode) (require 'pp) (pp (or ptree w3-last-parse-tree)))) (defalias 'w3-display-last-parse-tree 'w3-display-parse-tree) ;; For compatibility with the old parser interface. (defalias 'w3-preparse-buffer 'w3-parse-buffer) ;; %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% ;; % % ;; % This is the *ONLY* valid entry point in this file! % ;; % DO NOT call any of the other functions! % ;; % % ;; %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% (defun w3-parse-buffer (&optional buff) "Parse contents of BUFF as HTML. BUFF defaults to the value of url-working-buffer. Destructively alters contents of BUFF. Returns a data structure containing the parsed information." (set-buffer (or buff url-working-buffer)) (setq buff (current-buffer)) (set-syntax-table w3-sgml-md-syntax-table) (buffer-disable-undo (current-buffer)) (widen) ; sanity checking (goto-char (point-min)) (setq case-fold-search t) ; allows smaller regexp patterns ;; Some unknown pre-parse buffer munging. (if (fboundp 'sera-to-fidel-marker) (let ((sera-being-called-by-w3 t)) ;; eval stops the compiler from complaining. (eval '(sera-to-fidel-marker)))) (goto-char (point-min)) ;; *** Should premunge line boundaries. ;; ******************** (let* ( ;; Speed hack, see the variable doc string. (gc-cons-threshold (if (> w3-gc-cons-threshold-multiplier 0) (* w3-gc-cons-threshold-multiplier gc-cons-threshold) gc-cons-threshold)) ;; Used to determine if we made any progress since the last loop. (last-loop-start (point-min)) ;; How many iterations of the main loop have occurred. Used only ;; to send messages to the user periodically, since this function ;; can take some time. (loop-count 0) ;; Precomputing the loop-invariant parts of this for speed. (status-message-format (if url-show-status (format "Parsed %%3d%%%% of %d..." (- (point-max) (point-min))))) ;; Use a float value for 100 if possible, otherwise integer. ;; Determine which we can use outside of the loop for speed. (one-hundred (funcall (if (fboundp 'float) 'float 'identity) 100)) ;; The buffer which contains the HTML we are parsing. This ;; variable is used to avoid using the more expensive ;; save-excursion. (parse-buffer (current-buffer)) ;; Points to start of region of text since the previous tag. (between-tags-start (point-min)) ;; Points past end of region of text since the previous tag. Only ;; non-nil when the region has been completely determined and is ;; ready to be processed. between-tags-end ;; See doc string. w3-p-d-tag-name ;; See doc string. w3-p-d-end-tag-p ;; Is the tag we are looking at a null-end-tag-enabling ;; start-tag? net-tag-p ;; Attributes of the tag we are looking at. An alist whose items ;; are pairs of the form (SYMBOL . STRING). tag-attributes ;; Points past end of attribute value we are looking at. Points ;; past the syntactic construct, not the value of the attribute, ;; which may be at (1- attribute-value-end). attribute-value-end ;; Points past end of tag we are looking at. tag-end ;; See doc string. (w3-p-d-current-element (w3-fresh-element-for-tag '*document)) ;; See doc string. (w3-p-d-open-element-stack (list (w3-fresh-element-for-tag '*holder))) ;; ***not implemented yet*** (marked-section-undo-stack nil) ;; See doc string. (w3-p-d-debug-url t) ;; Any of the following variables with the comment ";*NESTED*" ;; are syntactic or semantic features that were introduced by ;; some containing element or marked section which will be undone ;; when we close that element or marked section. ;; See doc string. (w3-p-d-non-markup-chars nil) ;*NESTED* ;; See doc string. (w3-p-d-null-end-tag-enabled nil) ;*NESTED* ;; See doc string. (w3-p-d-in-parsed-marked-section nil) ;*NESTED* ;; See doc string. (w3-p-d-shortrefs nil) ;*NESTED* ;; See doc string. (w3-p-d-shortref-chars nil) ;*NESTED* ;; ******* maybe not needed. ;; ;; ;; Are we recognizing start-tags? ;; (recognizing-start-tags t) ;*NESTED* ;; ;; ;; Are we recognizing end-tags? If this is non-nil and not t, ;; ;; then only the end tag of the current open element is ;; ;; recognized. ;; (recognizing-end-tags t) ;*NESTED* ;; See doc string. (w3-p-d-exceptions nil) ;*NESTED* ;; Scratch variables used in this function ref attr-name attr-value content-model content open-list ) ;; Scratch variables used by macros and defsubsts we call. (w3-p-s-let-bindings (w3-update-non-markup-chars) (setq w3-p-s-baseobject (url-generic-parse-url (url-view-url t))) ;; Main loop. Handle markup as follows: ;; ;; non-empty tag: Handle the region since the previous tag as PCDATA, ;; RCDATA, CDATA, if allowed by syntax. Then handle the tag. ;; ;; general entity (&name;): expand it and parse the result. ;; ;; shortref (_, {, }, and ^ in math stuff): Expand it and parse the ;; result. ;; ;; SGML marked section (<![ keywords [ conditional-text ]]>): Either ;; strip the delimiters and parse the result or delete. ;; ;; comment: Delete. ;; ;; empty tag (<>, </>): Handle as the appropriate tag. ;; ;; markup declaration (e.g. <!DOCTYPE ...>): Delete. ;; ;; SGML processing instruction (<?name>): Delete. ;; (while ;; Continue as long as we processed something last time and we ;; have more to process. (prog1 (not (and (= last-loop-start (point)) (eobp))) (setq last-loop-start (point))) ;; Display progress messages if asked and/or do incremental display ;; of results (cond ((= 0 (% (setq loop-count (1+ loop-count)) 40)) (if status-message-format (message status-message-format ;; Percentage of buffer processed. (/ (* (point) one-hundred) (point-max)))))) ;; Go to next interesting thing in the buffer. (skip-chars-forward w3-p-d-non-markup-chars) ;; We are looking at a markup-starting character, and invalid ;; character, or end of buffer. (cond ((= ?< (following-char)) ;; We are looking at a tag, comment, markup declaration, SGML marked ;; section, SGML processing instruction, or non-markup "<". (forward-char) (cond ((looking-at "/?\\([a-z][-a-z0-9.]*\\)") ;; We are looking at a non-empty tag. ;; Downcase it in the buffer, to save creation of a string (downcase-region (match-beginning 1) (match-end 1)) (setq w3-p-d-tag-name (intern (buffer-substring (match-beginning 1) (match-end 1)))) (setq w3-p-d-end-tag-p (= ?/ (following-char))) (setq between-tags-end (1- (point))) (goto-char (match-end 0)) ;; Read the attributes from a start-tag. (if w3-p-d-end-tag-p (if (looking-at "[ \t\r\n/]*>") nil ;; This is in here to deal with those idiots who stick ;; attribute/value pairs on end tags. *sigh* (w3-debug-html "Evil attributes on end tag.") (skip-chars-forward "^>")) ;; Attribute values can be: ;; "STRING" where STRING does not contain the double quote ;; 'STRING' where STRING does not contain the single quote ;; name-start character, *name character ;; *name character ;; Digit, +name character ;; +Digit ;; or a SPACE-separated list of one of the last four ;; possibilities (there is a comment somewhere that this is a ;; misinterpretation of the grammar, so we ignore this ;; possibility). (while (looking-at (eval-when-compile (concat ;; Leading whitespace. "[ \n\r\t]*" ;; The attribute name, possibly with a bad syntax ;; component. "\\([a-z_][-a-z0-9.]*\\(\\([_][-a-z0-9._]*\\)?\\)\\)" ;; Trailing whitespace and perhaps an "=". "[ \n\r\t]*\\(\\(=[ \n\r\t]*\\)?\\)"))) (cond ((/= (match-beginning 2) (match-end 2)) (w3-debug-html :nocontext (format "Bad attribute name syntax: %s" (buffer-substring (match-beginning 1) (match-end 1)))))) ;; Downcase it in the buffer, to save creation of a string (downcase-region (match-beginning 1) (match-end 1)) (setq attr-name (intern (buffer-substring (match-beginning 1) (match-end 1)))) (goto-char (match-end 0)) (cond ((< (match-beginning 4) (match-end 4)) ;; A value was specified (e.g. ATTRIBUTE=VALUE). (cond ((looking-at (eval-when-compile (concat ;; Literal with double quotes. "\"\\([^\"]*\\)\"" "\\|" ;; Literal with single quotes. "'\\([^']*\\)'" "\\|" ;; Handle bad HTML conflicting with NET-enabling ;; start-tags. "\\([-a-z0-9.]+/[-a-z0-9._/#]+\\)[ \t\n\r>]" "\\|" ;; SGML NAME-syntax attribute value. "\\([-a-z0-9.]+\\)[ \t\n\r></]" ))) (cond ((or (match-beginning 1) (match-beginning 2)) ;; We have an attribute value literal. (narrow-to-region (1+ (match-beginning 0)) (1- (match-end 0))) ;; In attribute value literals, EE and RS are ignored ;; and RE and SEPCHAR characters sequences are ;; replaced by SPACEs. ;; ;; (There is no way right now to get RS into one of ;; these so that it can be ignored. This is due to ;; our using Unix line-handling conventions.) (skip-chars-forward "^&\t\n\r") (if (eobp) nil ;; We must expand entities and replace RS, RE, ;; and SEPCHAR. (goto-char (point-min)) (while (progn (skip-chars-forward "^&") (not (eobp))) (w3-expand-entity-at-point-maybe)) (subst-char-in-region (point-min) (point-max) ?\t ? ) (subst-char-in-region (point-min) (point-max) ?\n ? )) ;; Set this after we have changed the size of the ;; attribute. (setq attribute-value-end (1+ (point-max)))) ((match-beginning 4) (setq attribute-value-end (match-end 4)) (narrow-to-region (point) attribute-value-end)) ((match-beginning 3) (setq attribute-value-end (match-end 3)) (narrow-to-region (point) attribute-value-end) ;; Horribly illegal non-SGML handling of bad ;; HTML on the net. This can break valid HTML. (setq attr-value (buffer-substring (point) (match-end 3))) (w3-debug-html :nocontext (format "Evil attribute value syntax: %s" (buffer-substring (point-min) (point-max))))) (t (error "impossible attribute value")))) ((memq (following-char) '(?\" ?')) ;; Missing terminating quote character. (narrow-to-region (point) (progn (forward-char 1) (skip-chars-forward "^ \t\n\r'\"<>") (setq attribute-value-end (point)))) (w3-debug-html :nocontext (format "Attribute value missing end quote: %s" (buffer-substring (point-min) (point-max)))) (narrow-to-region (1+ (point-min)) (point-max))) (t ;; We have a syntactically invalid attribute value. Let's ;; make a best guess as to what the author intended. (narrow-to-region (point) (progn (skip-chars-forward "^ \t\n\r'\"<>") (setq attribute-value-end (point)))) (w3-debug-html :nocontext (format "Bad attribute value syntax: %s" (buffer-substring (point-min) (point-max)))))) ;; Now we have isolated the attribute value. We need to ;; munge the value depending on the syntax of the ;; attribute. ;; *** Right now, we only implement the necessary munging ;; for CDATA attributes, which is none. I'm not sure why ;; this happens to work for other attributes right now. ;; For any other kind of attribute, we are supposed to ;; * smash case ;; * remove leading/trailing whitespace ;; * smash multiple space sequences into single spaces ;; * verify the syntax of each token (setq attr-value (buffer-substring (point-min) (point-max))) (case attr-name (class (setq attr-value (split-string attr-value "[ ,]+"))) (align (if (string-match "^[ \t\r\n]*\\(.*\\)[ \t\r\n]*$" attr-value) (setq attr-value (downcase (substring attr-value (match-beginning 1) (match-end 1)))) (setq attr-value (downcase attr-value))) (setq attr-value (intern attr-value))) ((src href) ;; I should expand URLs here ) (otherwise nil) ) (widen) (goto-char attribute-value-end)) (t ;; No value was specified, in which case NAME should be ;; taken as ATTRIBUTE=NAME where NAME is one of the ;; enumerated values for ATTRIBUTE. ;; We assume here that ATTRIBUTE is the same as NAME. ;; *** Another piece of code will fix the attribute name if it ;; is wrong. (setq attr-value (symbol-name attr-name)))) ;; Accumulate the attributes. (setq tag-attributes (cons (cons attr-name attr-value) tag-attributes))) (cond ((and (eq w3-p-d-tag-name 'base) (setq w3-p-s-baseobject (or (assq 'src tag-attributes) (assq 'href tag-attributes)))) (setq w3-p-s-baseobject (url-generic-parse-url (cdr w3-p-s-baseobject)))) ((setq w3-p-s-btdt (or (assq 'src tag-attributes) (assq 'href tag-attributes) (assq 'action tag-attributes))) (setcdr w3-p-s-btdt (url-expand-file-name (cdr w3-p-s-btdt) w3-p-s-baseobject)) (setq w3-p-s-btdt (if (url-have-visited-url (cdr w3-p-s-btdt)) ":visited" ":link")) (if (assq 'class tag-attributes) (setcdr (assq 'class tag-attributes) (cons w3-p-s-btdt (cdr (assq 'class tag-attributes)))) (setq tag-attributes (cons (cons 'class (list w3-p-s-btdt)) tag-attributes)))) ) ) ;; Process the end of the tag. (skip-chars-forward " \t\n\r") (cond ((= ?> (following-char)) ;; Ordinary tag end. (forward-char 1)) ((and (= ?/ (following-char)) (not w3-p-d-end-tag-p)) ;; This is a NET-enabling start-tag. (setq net-tag-p t) (forward-char 1)) ((= ?< (following-char)) ;; *** Strictly speaking, the following text has to ;; lexically be STAGO or ETAGO, which means that it ;; can't match some other lexical unit. ;; Unclosed tag. nil) (t ;; Syntax error. (w3-debug-html (format "Bad unclosed %s%s tag" (if w3-p-d-end-tag-p "/" "") (w3-sgml-name-to-string w3-p-d-tag-name))))) (setq tag-end (point))) ((looking-at "/?>") ;; We are looking at an empty tag (<>, </>). (setq w3-p-d-end-tag-p (= ?/ (following-char))) (setq w3-p-d-tag-name (if w3-p-d-end-tag-p (w3-element-name w3-p-d-current-element) ;; *** Strictly speaking, if OMITTAG NO, then ;; we should use the most recently closed tag. ;; But OMITTAG YES in HTML and I'm lazy. (w3-element-name w3-p-d-current-element))) (setq tag-attributes nil) ;; *** Make sure this is not at top level. (setq between-tags-end (1- (point))) (setq tag-end (match-end 0))) ;; *** In SGML, <(doctype)element> is valid tag syntax. This ;; cannot occur in HTML because the CONCUR option is off in the ;; SGML declaration. ((looking-at "!--") ;; We found a comment, delete to end of comment. (delete-region (1- (point)) (progn (forward-char 1) ;; Skip over pairs of -- ... --. (if (looking-at "\\(--[^-]*\\(-[^-]+\\)*--[ \t\r\n]*\\)+>") (goto-char (match-end 0)) ;; Syntax error! (w3-debug-html "Bad comment (unterminated or unbalanced \"--\" pairs)") (forward-char 2) (or (re-search-forward "--[ \t\r\n]*>" nil t) (search-forward ">" nil t))) (point)))) ((looking-at "!>\\|\\?[^>]*>") ;; We are looking at an empty comment or a processing ;; instruction. Delete it. (replace-match "") (delete-char -1)) ((looking-at "![a-z]") ;; We are looking at a markup declaration. Delete it. ;; *** Technically speaking, to handle valid HTML I think we ;; need to handle "<!USEMAP ... >" declarations. In the future, ;; to handle general SGML, we should parse "<!DOCTYPE ... >" ;; declarations as well (which can contain other declarations). ;; In the very distant future, perhaps we will handle "<!SGML ;; ... >" declarations. ;; *** Should warn if it's not SGML, DOCTYPE, or USEMAP. (backward-char 1) (delete-region (point) (progn (condition-case nil (forward-sexp 1) (error ;; *** This might not actually be bad syntax, but might ;; instead be a -- ... -- comment with unbalanced ;; parentheses somewhere inside the declaration. Handling ;; this properly would require full parsing of markup ;; declarations, a goal for the future. (w3-debug-html "Bad <! syntax.") (skip-chars-forward "^>") (if (= ?> (following-char)) (forward-char)))) (point)))) ((looking-at "!\\\[\\(\\([ \t\n\r]*[a-z]+\\)+[ \t\n\r]*\\)\\\[") ;; We are looking at a marked section. ;; *** Strictly speaking, we should issue a warning if the ;; keywords are invalid or missing or if the "[" does not follow. ;; We must look at the keywords to understand how to parse it. ;; *** Strictly speaking, we should perform parameter entity ;; substitution on the keywords first. (goto-char (match-beginning 1)) (insert ?\)) (goto-char (1- (match-beginning 0))) (delete-char 3) (insert ?\() (backward-char 1) (let* ((keywords (read (current-buffer))) ;; Multiple keywords may appear, but only the most ;; significant takes effect. Rank order is IGNORE, CDATA, ;; RCDATA, INCLUDE, and TEMP. INCLUDE and TEMP have the ;; same effect. (keyword (car-safe (cond ((memq 'IGNORE keywords)) ((memq 'CDATA keywords)) ((memq 'RCDATA keywords)) ((memq 'INCLUDE keywords)) ((memq 'TEMP keywords)))))) (or (= ?\[ (following-char)) ;; I probably shouldn't even check this, since it is so ;; impossible. (error "impossible ??")) (forward-char 1) (delete-region (1- (match-beginning 0)) (point)) (cond ((eq 'IGNORE keyword) ;; Scan forward skipping over matching <![ ... ]]> ;; until we find an unmatched "]]>". (let ((ignore-nesting 1) (start-pos (point))) (while (> ignore-nesting 0) (if (re-search-forward "<!\\\\\[\\|\]\]>" nil t) (setq ignore-nesting (if (eq ?> (preceding-char)) (1- ignore-nesting) (1+ ignore-nesting))) (w3-debug-html "Unterminated IGNORE marked section.") (setq ignore-nesting 0) (goto-char start-pos))) (delete-region start-pos (point)))) ((eq 'CDATA keyword) (error "***unimplemented***")) ((eq 'RCDATA keyword) (error "***unimplemented***")) ((memq keyword '(INCLUDE TEMP)) (error "***unimplemented***"))))) ((and (looking-at "!") w3-netscape-compatible-comments) ;; Horribly illegal non-SGML handling of bad HTML on the net. ;; This can break valid HTML. ;; This arises because Netscape discards anything looking like ;; "<!...>". So people expect they can use this construct as ;; a comment. (w3-debug-html "Evil <! comment syntax.") (backward-char 1) (delete-region (point) (progn (skip-chars-forward "^>") (if (= ?> (following-char)) (forward-char)) (point)))) (t ;; This < is not a markup character. Pretend we didn't notice ;; it at all. We have skipped over the < already, so just loop ;; again. ))) ((= ?& (following-char)) (w3-expand-entity-at-point-maybe)) ((and (= ?\] (following-char)) w3-p-d-in-parsed-marked-section (looking-at "]]>")) ;; *** handle the end of a parsed marked section. (error "***unimplemented***")) ((and (= ?/ (following-char)) w3-p-d-null-end-tag-enabled) ;; We are looking at a null end tag. (setq w3-p-d-end-tag-p t) (setq between-tags-end (point)) (setq tag-end (1+ (point))) (setq w3-p-d-tag-name (w3-element-name w3-p-d-current-element))) ;; This can be slow, since we'll hardly ever get here. ;; *** Strictly speaking, I think we're supposed to handle ;; shortrefs that begin with the same characters as other markup, ;; preferring the longest match. ;; I will assume that shortrefs never begin with <, &, \], /. ((setq ref (catch 'found-shortref (let ((refs w3-p-d-shortrefs)) (while refs (if (looking-at (car (car refs))) (throw 'found-shortref (cdr (car refs)))) (setq refs (cdr refs)))))) ;; We are looking at a shortref for which there is an ;; expansion defined in the current syntax. Replace with the ;; expansion, leaving point at the beginning so it will be parsed ;; on the next loop. ;; *** eek. This is wrong if the shortref is for an entity with ;; CDATA syntax which should not be reparsed for tags. (replace-match "") (let ((pt (point))) (insert ref) (goto-char pt))) ((looking-at (eval-when-compile (concat "[" (w3-invalid-sgml-chars) "]"))) (w3-debug-html (format "Invalid SGML character: %c" (following-char))) (insert (or (cdr-safe (assq (following-char) ;; These characters are apparently ;; from a Windows character set. '((146 . "'") (153 . "TM")))) "")) (delete-char 1)) ((eobp) ;; We have finished the buffer. Make sure we process the last ;; piece of text, if any. (setq between-tags-end (point)) ;; We have to test what's on the element stack because this ;; piece of code gets executed twice. (cond ((not (eq '*holder (w3-element-name w3-p-d-current-element))) ;; This forces the calculation of implied omitted end tags. (setq w3-p-d-tag-name '*document) (setq w3-p-d-end-tag-p t) (setq tag-end (point))))) (t (error "unreachable code, this can't happen"))) ;; If we have determined the boundaries of a non-empty between-tags ;; region of text, then handle it. (cond (between-tags-end (cond ((< between-tags-start between-tags-end) ;; We have a non-empty between-tags region. ;; We check if it's entirely whitespace, because we record the ;; transitions for whitespace separately from those for ;; data with non-whitespace characters. (goto-char between-tags-start) (skip-chars-forward " \t\n\r" between-tags-end) (cond ((w3-grok-tag-or-data (prog1 (if (= between-tags-end (point)) '*space '*data) (goto-char between-tags-end))) ;; We have to include the text in the current element's ;; contents. If this is the first item in the current ;; element's contents, don't include a leading newline if ;; there is one. Add a trailing newline as a separate text ;; item so that it can be removed later if it turns out to ;; be the last item in the current element's contents when ;; the current element is closed. ;; *** We could perform this test before calling ;; w3-grok-tag-or-data, but it's not clear which will be ;; faster in practice. (or (setq content (w3-element-content w3-p-d-current-element)) ;; *** Strictly speaking, in SGML the record end is ;; carriage return, not line feed. (if (= ?\n (char-after between-tags-start)) (setq between-tags-start (1+ between-tags-start)))) (if (= between-tags-start (point)) ;; Do nothing. nil ;; We are definitely going to add data characters to the ;; content. (cond ((and (= ?\n (preceding-char)) (/= between-tags-start (1- (point)))) (setq content (cons (buffer-substring between-tags-start (1- (point))) content)) (setq content (cons "\n" content))) (t (setq content (cons (buffer-substring between-tags-start (point)) content)))) (w3-set-element-content w3-p-d-current-element content)))))) (setq between-tags-end nil))) ;; If the previous expression modified (point), then it went to ;; the value of between-tags-end. ;; If we found a start or end-tag, we need to handle it. (cond (w3-p-d-tag-name ;; Move past the tag and prepare for next between-tags region. (goto-char tag-end) (setq between-tags-start (point)) (cond (w3-p-d-end-tag-p ;; Handle an end-tag. (if (eq w3-p-d-tag-name (w3-element-name w3-p-d-current-element)) (w3-close-element) ;; Handle the complex version. We have to search up (down?) ;; the open element stack to find the element that matches (if ;; any). Then we close all of the elements. On a conforming ;; SGML document this can do no wrong and it's not ;; unreasonable on a non-conforming document. ;; Can't safely modify stack until we know the element we want ;; to find is in there, so work with a copy. (setq open-list w3-p-d-open-element-stack) (while (and open-list (not (eq w3-p-d-tag-name (w3-element-name (car open-list))))) (setq open-list (cdr open-list))) (cond (open-list ;; We found a match. Pop elements. ;; We will use the following value as a sentinel. (setq open-list (cdr open-list)) (while (not (eq open-list w3-p-d-open-element-stack)) (w3-close-element t)) (w3-close-element)) (t ;; Bogus end tag. (w3-debug-html (format "Unmatched end-tag </%s>" (w3-sgml-name-to-string w3-p-d-tag-name))))))) (t ;; Handle a start-tag. (cond ;; Check if the new element is allowed in the current element's ;; content model. ((w3-grok-tag-or-data w3-p-d-tag-name) (w3-open-element w3-p-d-tag-name tag-attributes) ;; Handle NET-enabling start tags. (cond ((and net-tag-p (not w3-p-d-null-end-tag-enabled)) ;; Save old values. (w3-set-element-undo-list w3-p-d-current-element (cons (cons 'w3-p-d-non-markup-chars w3-p-d-non-markup-chars) (cons '(w3-p-d-null-end-tag-enabled . nil) (w3-element-undo-list w3-p-d-current-element)))) ;; Alter syntax. (setq w3-p-d-null-end-tag-enabled t) (w3-update-non-markup-chars))) (setq content-model (w3-element-content-model w3-p-d-current-element)) ;; If the element does not have parsed contents, then we ;; can find its contents immediately. (cond ((memq content-model '(EMPTY CDATA XCDATA XXCDATA RCDATA)) (cond ((eq 'EMPTY content-model) (w3-close-element)) ((eq 'CDATA content-model) ;; CDATA: all data characters until an end-tag. We'll ;; process the end-tag on the next loop. (if (re-search-forward (if w3-p-d-null-end-tag-enabled "</[a-z>]\\|/" "</[a-z>]") nil 'move) (goto-char (match-beginning 0)))) ((eq 'XCDATA content-model) ;; XCDATA: special non-SGML-standard mode which includes ;; all data characters until "</foo" is seen where "foo" ;; is the name of this element (for XMP and LISTING). (if (search-forward (concat "</" (symbol-name (w3-element-name w3-p-d-current-element))) nil 'move) (goto-char (match-beginning 0)))) ((eq 'XXCDATA content-model) ;; XXCDATA: special non-SGML-standard mode which includes ;; all data until end-of-entity (end-of-buffer for us) ;; (for PLAINTEXT). (goto-char (point-max))) ((eq 'RCDATA content-model) ;; RCDATA: all data characters until end-tag is seen, ;; except that entities are expanded first, although the ;; expansions are _not_ scanned for end-tags, although the ;; expansions _are_ scanned for further entity ;; references. (while (progn (if (re-search-forward (if w3-p-d-null-end-tag-enabled "</[a-z>]\\|[/&]" "</[a-z>]\\|&") nil 'move) (goto-char (match-beginning 0))) (= ?& (following-char))) (w3-expand-entity-at-point-maybe))))))) (t ;; The element is illegal here. We'll just discard the start ;; tag as though we never saw it. )))) (setq w3-p-d-tag-name nil) (setq w3-p-d-end-tag-p nil) (setq net-tag-p nil) (setq tag-attributes nil) (setq tag-end nil))) ;; End of main while loop. ) ;; We have finished parsing the buffer! (if status-message-format (message "%sdone" (format status-message-format 100))) ;; *** For debugging, save the true parse tree. ;; *** Make this look inside *DOCUMENT. (setq w3-last-parse-tree (w3-element-content w3-p-d-current-element)) (w3-element-content w3-p-d-current-element) ))) (provide 'w3-parse) ;; Local variables: ;; indent-tabs-mode: nil ;; end: