Mercurial > hg > xemacs-beta
diff lisp/w3/w3-parse.el @ 80:1ce6082ce73f r20-0b90
Import from CVS: tag r20-0b90
author | cvs |
---|---|
date | Mon, 13 Aug 2007 09:06:37 +0200 |
parents | 131b0175ea99 |
children | 6a378aca36af |
line wrap: on
line diff
--- a/lisp/w3/w3-parse.el Mon Aug 13 09:05:44 2007 +0200 +++ b/lisp/w3/w3-parse.el Mon Aug 13 09:06:37 2007 +0200 @@ -1,4 +1,4 @@ -;; Created by: Joe Wells, jbw@csb.bu.edu +;; Created by: Joe Wells, jbw@cs.bu.edu ;; Created on: Sat Sep 30 17:25:40 1995 ;; Filename: w3-parse.el ;; Purpose: Parse HTML and/or SGML for Emacs W3 browser. @@ -17,8 +17,9 @@ ;; GNU General Public License for more details. ;; ;; You should have received a copy of the GNU General Public License -;; along with this program; if not, write to the Free Software -;; Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. +;; along with GNU Emacs; see the file COPYING. If not, write to the +;; Free Software Foundation, Inc., 59 Temple Place - Suite 330, +;; Boston, MA 02111-1307, USA. ;; ;; On November 13, 1995, the license was available at ;; <URL:ftp://prep.ai.mit.edu/pub/gnu/COPYING-2.0>. It may still be @@ -57,6 +58,7 @@ ;; will not complain, these variables are defined with defvar. (require 'w3-vars) +(require 'mule-sysdp) (eval-when-compile (defconst w3-p-s-var-list nil @@ -116,11 +118,6 @@ "A stack of the currently open elements, with the innermost enclosing element on top and the outermost on bottom.") - (defvar w3-p-d-parse-tag-stream-tail-pointer) - (put 'w3-p-d-parse-tag-stream-tail-pointer 'variable-documentation - "Points to last cons cell in parse-tag stream. We add items to tail of -parse-tag-stream instead of head.") - (defvar w3-p-d-shortrefs) (put 'w3-p-d-shortrefs 'variable-documentation "An alist of the magic entity reference strings in the current @@ -289,15 +286,7 @@ (put (car (car html-entities)) 'html-entity-expansion (cons 'CDATA (if (integerp (cdr (car html-entities))) (char-to-string - (let ((c (cdr (car html-entities)))) - (cond - ((and (> c 127) (boundp 'MULE)) - (make-character lc-ltn1 c)) - ;;((and (> c 127) (featurep 'mule)) - ;; What??? - ;;) - (t - c)))) + (mule-make-iso-character (cdr (car html-entities)))) (cdr (car html-entities))))) (setq html-entities (cdr html-entities)))) @@ -441,18 +430,11 @@ ;; Bill wants to call w3-resolve-numeric-entity here, but I think ;; that functionality belongs in char-to-string. ;; The largest valid character in the I18N version of HTML is 65533. - ;; <URL:ftp://ds.internic.net/internet-drafts/draft-ietf-html-i18n-01.txt> + ;; ftp://ds.internic.net/internet-drafts/draft-ietf-html-i18n-01.txt ;; wrongo! Apparently, mule doesn't do sane things with char-to-string ;; -wmp 7/9/96 (insert (char-to-string - (cond - ((and (boundp 'MULE) (> w3-p-s-num 127)) - (make-character lc-ltn1 w3-p-s-num)) - ;;((and (featurep 'mule) (> w3-p-s-num 127)) - ;;what?? - ;;) - (t - w3-p-s-num))))) + (mule-make-iso-character w3-p-s-num)))) ((looking-at "&#\\(re\\|rs\\|space\\|tab\\)[\ ;\n]?") ; \n should be \r (replace-match (assq (upcase (char-after (+ 3 (point)))) '(;; *** Strictly speaking, record end should be @@ -785,15 +767,6 @@ (if w3-p-d-null-end-tag-enabled "/" "") (if w3-p-d-in-parsed-marked-section "]" "") (or w3-p-d-shortref-chars "")))) - - ;; Modifies free variable: - ;; w3-p-d-parse-tag-stream-tail-pointer - (defsubst w3-add-display-item (tag value) - (setcdr w3-p-d-parse-tag-stream-tail-pointer - (list (cons tag value))) - (setq w3-p-d-parse-tag-stream-tail-pointer - (cdr w3-p-d-parse-tag-stream-tail-pointer))) - ) (eval-when-compile @@ -808,13 +781,6 @@ ;; w3-p-s-overrides, w3-p-s-undo-list, w3-p-s-var (defsubst w3-open-element (tag attributes) - ;; Send trailing data character item in the old current element to - ;; display engine. - (if (stringp (car-safe (w3-element-content w3-p-d-current-element))) - (w3-add-display-item - 'text - (car-safe (w3-element-content w3-p-d-current-element)))) - ;; Push new element on stack. (setq w3-p-d-open-element-stack (cons w3-p-d-current-element w3-p-d-open-element-stack)) @@ -874,14 +840,7 @@ w3-p-d-current-element (w3-element-content-model (car w3-p-d-open-element-stack)))) - ;; Send the start-tag and attributes to the display engine. - (if (memq tag '(plaintext style xmp textarea)) - ;; Garbage special-casing for old display engine. - ;; Nothing is sent until end-tag is found. - ;; The DTD will ensure no subelements of these elements. - nil - ;; Normal procedure. - (w3-add-display-item tag attributes))) + ) ) ;; The protocol for handing items to the display engine is as follows. @@ -960,33 +919,14 @@ (cond ((null w3-p-s-content)) ((equal "\n" (car w3-p-s-content)) (setq w3-p-s-content (cdr w3-p-s-content))) - ((and (stringp (car w3-p-s-content)) - ;; Garbage special-casing for old display engine. - (not (memq w3-p-s-end-tag - '(/plaintext /style /xmp /textarea)))) - (w3-add-display-item 'text (car w3-p-s-content)))) + ) - ;; Send the end-tag to the display engine, but only if the element is - ;; allowed to have an end tag. - (cond ((memq w3-p-s-end-tag '(/plaintext /style /xmp /textarea)) - ;; Garbage special-casing for old display engine. - ;; Format old display engine expects for these elements: - ;; (START-TAG . ((data . DATA-CHARACTERS) . ATTRIBUTES)) - (w3-add-display-item - ;; Use the *start*-tag, not the end-tag. - (w3-element-name w3-p-d-current-element) - (cons (cons 'data - (condition-case nil - (mapconcat 'identity w3-p-s-content "") - (error "eeek! subelement content!"))) - (w3-element-attributes w3-p-d-current-element)))) - ;; *** Handle LISTING the way the old parser did. + (cond ;; *** Handle LISTING the way the old parser did. ((eq 'EMPTY (w3-element-content-model w3-p-d-current-element)) ;; Do nothing, can't have an end tag. ) (t ;; Normal case. - (w3-add-display-item w3-p-s-end-tag nil) (if (null w3-p-s-content) (w3-debug-html :bad-style :outer @@ -1175,14 +1115,14 @@ (%imagemaps . (area map)) ;; special action is taken for %text inside %body.content in the ;; content model of each element. - (%body.content . (%heading %block hr div address %imagemaps)) + (%body.content . (%heading %block style hr div address %imagemaps)) (%heading . (h1 h2 h3 h4 h5 h6)) ;; Emacs-w3 extensions (%emacsw3-crud . (pinhead flame cookie yogsothoth hype peek)) - (%block . (p %list dl form %preformatted font + (%block . (p %list dl form %preformatted %blockquote isindex fn table fig note center %block-deprecated %block-obsoleted)) (%list . (ul ol)) @@ -1196,10 +1136,10 @@ (%text . (*data b %notmath sub sup %emacsw3-crud)) (%notmath . (%special %font %phrase %misc)) - (%font . (i u s strike tt big small sub sup + (%font . (i u s strike tt big small sub sup font roach secret wired)) ;; B left out for MATH (%phrase . (em strong dfn code samp kbd var cite blink)) - (%special . (a img applet font br script map math tab)) + (%special . (a img applet object font basefont br script style map math tab span bdo)) (%misc . (q lang au person acronym abbrev ins del)) (%formula . (*data %math)) @@ -1277,8 +1217,9 @@ (end-tag-omissible . t)) ;; SCRIPT - - (#PCDATA) ((script) - (content-model . CDATA ; not official, but allows - ; comment hiding of script + (content-model . XCDATA ; not official, but allows + ; comment hiding of script, and also + ; idiots that use '</' in scripts. )) ;; TITLE - - (#PCDATA) ((title) @@ -1327,6 +1268,12 @@ include-space ((%in-text-ignore)) nil)])) + ((span bdo) + (content-model . [((%text) + include-space + nil + nil)]) + ) ((p) (content-model . [((%text) include-space @@ -1456,7 +1403,7 @@ ((credit) *close)) nil)]) (end-tag-omissible . t)) - ((%emacsw3-crud) + ((%emacsw3-crud basefont) (content-model . EMPTY)) ;; FORM - - %body.content -(FORM) +(INPUT|KEYGEN|SELECT|TEXTAREA) ((form) @@ -1716,8 +1663,9 @@ (content-model . EMPTY)) ;; ;; APPLET is a Java thing. + ;; OBJECT is a cougar thing ;; <URL:http://java.sun.com/JDK-beta/filesinkit/README> - ((applet) + ((applet object) ;; I really don't want to add another ANY content-model. (content-model . XINHERIT) (inclusions . (param))) @@ -1749,6 +1697,8 @@ (w3-p-s-var-def w3-p-s-tran-list) (w3-p-s-var-def w3-p-s-content-model) (w3-p-s-var-def w3-p-s-except) + (w3-p-s-var-def w3-p-s-baseobject) + (w3-p-s-var-def w3-p-s-btdt) ;; Uses free variables: ;; w3-p-d-current-element, w3-p-d-exceptions ;; Destroys free variables: @@ -1765,7 +1715,7 @@ '(CDATA RCDATA XCDATA XXCDATA)) (memq tag-name '(*data *space))) ;; *** Implement ANY. - (error "impossible")) + (error "impossible content model lossage")) (setq w3-p-s-includep t) ;; Exit loop. nil) @@ -1945,7 +1895,7 @@ ;; content-model. t) (t - (error "impossible"))))))) + (error "impossible transition"))))))) ;; Empty while loop body. ) @@ -1985,12 +1935,10 @@ ;; % DO NOT call any of the other functions! % ;; % % ;; %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -(defun w3-parse-buffer (&optional buff nodraw) +(defun w3-parse-buffer (&optional buff) "Parse contents of BUFF as HTML. BUFF defaults to the value of url-working-buffer. Destructively alters contents of BUFF. -Unless optional second argument NODRAW is non-nil, calls the display -engine on the parsed HTML. Returns a data structure containing the parsed information." (set-buffer (or buff url-working-buffer)) @@ -2011,10 +1959,6 @@ ;; *** Should premunge line boundaries. ;; ******************** - ;; Prepare another buffer to draw in unless told not to. - (if (not nodraw) - (w3-prepare-draw-buffer-for-parse-buffer)) - (let* ( ;; Speed hack, see the variable doc string. (gc-cons-threshold (if (> w3-gc-cons-threshold-multiplier 0) @@ -2039,20 +1983,6 @@ ;; Determine which we can use outside of the loop for speed. (one-hundred (funcall (if (fboundp 'float) 'float 'identity) 100)) - ;; Speed up checking whether to do incremental display. - (w3-do-incremental-display (if nodraw nil w3-do-incremental-display)) - - ;; Used to convert parse tree to tag stream that old display - ;; engine expects. Will change when display engine is rewritten. - (parse-tag-stream '(*dummy)) - - ;; See doc string. - (w3-p-d-parse-tag-stream-tail-pointer parse-tag-stream) - - ;; Points to cons cell in parse-tag-stream whose car is the last - ;; item that has been sent to display engine. - (parse-tag-stream-last-displayed-item parse-tag-stream) - ;; The buffer which contains the HTML we are parsing. This ;; variable is used to avoid using the more expensive ;; save-excursion. @@ -2138,9 +2068,8 @@ ) ;; Scratch variables used by macros and defsubsts we call. (w3-p-s-let-bindings - (w3-update-non-markup-chars) - + (setq w3-p-s-baseobject (url-generic-parse-url (url-view-url t))) ;; Main loop. Handle markup as follows: ;; ;; non-empty tag: Handle the region since the previous tag as PCDATA, @@ -2173,8 +2102,6 @@ ;; Display progress messages if asked and/or do incremental display ;; of results (cond ((= 0 (% (setq loop-count (1+ loop-count)) 40)) - (if w3-do-incremental-display - (w3-pause)) (if status-message-format (message status-message-format ;; Percentage of buffer processed. @@ -2196,17 +2123,24 @@ ((looking-at "/?\\([a-z][-a-z0-9.]*\\)") ;; We are looking at a non-empty tag. - + + ;; Downcase it in the buffer, to save creation of a string + (downcase-region (match-beginning 1) (match-end 1)) (setq w3-p-d-tag-name - (intern (downcase (buffer-substring (match-beginning 1) - (match-end 1))))) + (intern (buffer-substring (match-beginning 1) + (match-end 1)))) (setq w3-p-d-end-tag-p (= ?/ (following-char))) (setq between-tags-end (1- (point))) (goto-char (match-end 0)) ;; Read the attributes from a start-tag. - (or - w3-p-d-end-tag-p + (if w3-p-d-end-tag-p + (if (looking-at "[ \t\r\n/]*>") + nil + ;; This is in here to deal with those idiots who stick + ;; attribute/value pairs on end tags. *sigh* + (w3-debug-html "Evil attributes on end tag.") + (skip-chars-forward "^>")) ;; Attribute values can be: ;; "STRING" where STRING does not contain the double quote @@ -2227,7 +2161,7 @@ "[ \n\r\t]*" ;; The attribute name, possibly with a bad syntax ;; component. - "\\([a-z][-a-z0-9.]*\\(\\([_][-a-z0-9._]*\\)?\\)\\)" + "\\([a-z_][-a-z0-9.]*\\(\\([_][-a-z0-9._]*\\)?\\)\\)" ;; Trailing whitespace and perhaps an "=". "[ \n\r\t]*\\(\\(=[ \n\r\t]*\\)?\\)"))) @@ -2237,10 +2171,12 @@ (format "Bad attribute name syntax: %s" (buffer-substring (match-beginning 1) (match-end 1)))))) - + + ;; Downcase it in the buffer, to save creation of a string + (downcase-region (match-beginning 1) (match-end 1)) (setq attr-name - (intern (downcase (buffer-substring (match-beginning 1) - (match-end 1))))) + (intern (buffer-substring (match-beginning 1) + (match-end 1)))) (goto-char (match-end 0)) (cond ((< (match-beginning 4) (match-end 4)) @@ -2253,7 +2189,7 @@ "\"\\([^\"]*\\)\"" "\\|" ;; Literal with single quotes. - "'\\([^']\\)*'" + "'\\([^']*\\)'" "\\|" ;; Handle bad HTML conflicting with NET-enabling ;; start-tags. @@ -2286,8 +2222,8 @@ (skip-chars-forward "^&") (not (eobp))) (w3-expand-entity-at-point-maybe)) - (subst-char-in-region (point-min) (point-max) ?\t 32) - (subst-char-in-region (point-min) (point-max) ?\n 32)) + (subst-char-in-region (point-min) (point-max) ?\t ? ) + (subst-char-in-region (point-min) (point-max) ?\n ? )) ;; Set this after we have changed the size of the ;; attribute. (setq attribute-value-end (1+ (point-max)))) @@ -2305,13 +2241,13 @@ (format "Evil attribute value syntax: %s" (buffer-substring (point-min) (point-max))))) (t - (error "impossible")))) + (error "impossible attribute value")))) ((memq (following-char) '(?\" ?')) ;; Missing terminating quote character. (narrow-to-region (point) (progn (forward-char 1) - (skip-chars-forward "^ \t\n\r'\"=<>") + (skip-chars-forward "^ \t\n\r'\"<>") (setq attribute-value-end (point)))) (w3-debug-html :nocontext (format "Attribute value missing end quote: %s" @@ -2322,7 +2258,7 @@ ;; make a best guess as to what the author intended. (narrow-to-region (point) (progn - (skip-chars-forward "^ \t\n\r'\"=<>") + (skip-chars-forward "^ \t\n\r'\"<>") (setq attribute-value-end (point)))) (w3-debug-html :nocontext (format "Bad attribute value syntax: %s" @@ -2339,6 +2275,23 @@ ;; * smash multiple space sequences into single spaces ;; * verify the syntax of each token (setq attr-value (buffer-substring (point-min) (point-max))) + (case attr-name + (class + (setq attr-value (split-string attr-value "[ ,]+"))) + (align + (if (string-match "^[ \t\r\n]*\\(.*\\)[ \t\r\n]*$" + attr-value) + (setq attr-value (downcase + (substring attr-value + (match-beginning 1) + (match-end 1)))) + (setq attr-value (downcase attr-value))) + (setq attr-value (intern attr-value))) + ((src href) + ;; I should expand URLs here + ) + (otherwise nil) + ) (widen) (goto-char attribute-value-end)) (t @@ -2352,7 +2305,31 @@ ;; Accumulate the attributes. (setq tag-attributes (cons (cons attr-name attr-value) - tag-attributes)))) + tag-attributes))) + + (cond + ((and (eq w3-p-d-tag-name 'base) + (setq w3-p-s-baseobject + (or (assq 'src tag-attributes) + (assq 'href tag-attributes)))) + (setq w3-p-s-baseobject (url-generic-parse-url + (cdr w3-p-s-baseobject)))) + ((setq w3-p-s-btdt (or (assq 'src tag-attributes) + (assq 'href tag-attributes) + (assq 'action tag-attributes))) + (setcdr w3-p-s-btdt (url-expand-file-name (cdr w3-p-s-btdt) + w3-p-s-baseobject)) + (setq w3-p-s-btdt (if (url-have-visited-url (cdr w3-p-s-btdt)) + ":visited" + ":link")) + (if (assq 'class tag-attributes) + (setcdr (assq 'class tag-attributes) + (cons w3-p-s-btdt + (cdr (assq 'class tag-attributes)))) + (setq tag-attributes (cons (cons 'class (list w3-p-s-btdt)) + tag-attributes)))) + ) + ) ;; Process the end of the tag. (skip-chars-forward " \t\n\r") @@ -2473,7 +2450,7 @@ (or (= ?\[ (following-char)) ;; I probably shouldn't even check this, since it is so ;; impossible. - (error "impossible")) + (error "impossible ??")) (forward-char 1) (delete-region (1- (match-beginning 0)) (point)) (cond ((eq 'IGNORE keyword) @@ -2625,26 +2602,12 @@ nil ;; We are definitely going to add data characters to the ;; content. - ;; Protocol is that all but last data character item - ;; must have been sent to display engine. - (and content - (stringp (car content)) - ;; Gross, disgusting hack to deal with old interface - ;; to display engine. Remove as soon as possible. - (not (memq (w3-element-name w3-p-d-current-element) - '(plaintext style xmp textarea))) - (w3-add-display-item 'text (car content))) (cond ((and (= ?\n (preceding-char)) (/= between-tags-start (1- (point)))) (setq content (cons (buffer-substring between-tags-start (1- (point))) content)) - ;; Gross, disgusting hack to deal with old interface - ;; to display engine. Remove as soon as possible. - (or (memq (w3-element-name w3-p-d-current-element) - '(plaintext style xmp textarea)) - (w3-add-display-item 'text (car content))) (setq content (cons "\n" content))) (t (setq content (cons (buffer-substring between-tags-start @@ -2774,72 +2737,22 @@ (setq tag-attributes nil) (setq tag-end nil))) - ;; Hand items to the display engine. - (cond ((not nodraw) - (set-buffer w3-draw-buffer) - (while (not (eq parse-tag-stream-last-displayed-item - w3-p-d-parse-tag-stream-tail-pointer)) - (setq parse-tag-stream-last-displayed-item - (cdr parse-tag-stream-last-displayed-item)) - ;; We call w3-handle-single-tag from only one spot so that it - ;; is reasonable to inline it, since it is a big function. - (w3-handle-single-tag - (car (car parse-tag-stream-last-displayed-item)) - (cdr (car parse-tag-stream-last-displayed-item)))) - (set-buffer parse-buffer))) - ;; End of main while loop. ) ;; We have finished parsing the buffer! (if status-message-format (message "%sdone" (format status-message-format 100))) - ;; Do this now so the user can see the full results before Emacs - ;; goes off and garbage-collects for an hour. :-( - (if w3-do-incremental-display - (w3-pause)) ;; *** For debugging, save the true parse tree. ;; *** Make this look inside *DOCUMENT. (setq w3-last-parse-tree (w3-element-content w3-p-d-current-element)) - - ;; Return the parse in the format expected, a stream of tags - ;; possibly with a buffer at the front. - (if nodraw - ;; Discard the *dummy item at start of list. - (cdr parse-tag-stream) - (cons w3-draw-buffer (cdr parse-tag-stream))) - + + (w3-element-content w3-p-d-current-element) ))) -;;; -;;; Initialization of display engine to accept parser output. -;;; - -(defun w3-prepare-draw-buffer-for-parse-buffer () - (setq list-buffers-directory nil) - (let ((buf (get-buffer-create (url-generate-new-buffer-name - "Untitled"))) - (info (mapcar (function (lambda (x) (cons x (symbol-value x)))) - w3-persistent-variables))) - (setq w3-draw-buffer buf) - (save-excursion - (set-window-buffer (selected-window) buf) - (set-buffer buf) - (setq w3-draw-buffer (current-buffer)) - (erase-buffer) - (buffer-disable-undo (current-buffer)) - (mapcar (function (lambda (x) (set (car x) (cdr x)))) info) - (setq w3-last-fill-pos (point)) - (setq fill-column (min (- (or w3-strict-width (window-width)) - w3-right-border) - (or w3-maximum-line-length (window-width)))) - (setq fill-prefix "") - (w3-init-state)))) - - (provide 'w3-parse)