comparison lisp/w3/w3-parse.el @ 70:131b0175ea99 r20-0b30

Import from CVS: tag r20-0b30
author cvs
date Mon, 13 Aug 2007 09:02:59 +0200
parents 1a767b41a199
children 1ce6082ce73f
comparison
equal deleted inserted replaced
69:804d1389bcd6 70:131b0175ea99
1 ;; Created by: Joe Wells, jbw@cs.bu.edu 1 ;; Created by: Joe Wells, jbw@csb.bu.edu
2 ;; Created on: Sat Sep 30 17:25:40 1995 2 ;; Created on: Sat Sep 30 17:25:40 1995
3 ;; Filename: w3-parse.el 3 ;; Filename: w3-parse.el
4 ;; Purpose: Parse HTML and/or SGML for Emacs W3 browser. 4 ;; Purpose: Parse HTML and/or SGML for Emacs W3 browser.
5 5
6 ;; Copyright © 1995, 1996, 1997 Joseph Brian Wells 6 ;; Copyright © 1995, 1996 Joseph Brian Wells
7 ;; Copyright © 1993, 1994, 1995 by William M. Perry (wmperry@cs.indiana.edu) 7 ;; Copyright © 1993, 1994, 1995 by William M. Perry (wmperry@cs.indiana.edu)
8 ;; 8 ;;
9 ;; This program is free software; you can redistribute it and/or modify 9 ;; This program is free software; you can redistribute it and/or modify
10 ;; it under the terms of the GNU General Public License as published by 10 ;; it under the terms of the GNU General Public License as published by
11 ;; the Free Software Foundation; either version 2 of the License, or 11 ;; the Free Software Foundation; either version 2 of the License, or
15 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of 15 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
16 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 ;; GNU General Public License for more details. 17 ;; GNU General Public License for more details.
18 ;; 18 ;;
19 ;; You should have received a copy of the GNU General Public License 19 ;; You should have received a copy of the GNU General Public License
20 ;; along with GNU Emacs; see the file COPYING. If not, write to the 20 ;; along with this program; if not, write to the Free Software
21 ;; Free Software Foundation, Inc., 59 Temple Place - Suite 330, 21 ;; Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
22 ;; Boston, MA 02111-1307, USA.
23 ;; 22 ;;
24 ;; On November 13, 1995, the license was available at 23 ;; On November 13, 1995, the license was available at
25 ;; <URL:ftp://prep.ai.mit.edu/pub/gnu/COPYING-2.0>. It may still be 24 ;; <URL:ftp://prep.ai.mit.edu/pub/gnu/COPYING-2.0>. It may still be
26 ;; obtainable via that URL. 25 ;; obtainable via that URL.
27 26
56 ;; (remember this is really all one function). Some of the code which 55 ;; (remember this is really all one function). Some of the code which
57 ;; updates them is located inside the subfunctions. So that the compiler 56 ;; updates them is located inside the subfunctions. So that the compiler
58 ;; will not complain, these variables are defined with defvar. 57 ;; will not complain, these variables are defined with defvar.
59 58
60 (require 'w3-vars) 59 (require 'w3-vars)
61 (require 'mule-sysdp)
62 60
63 (eval-when-compile 61 (eval-when-compile
64 (defconst w3-p-s-var-list nil 62 (defconst w3-p-s-var-list nil
65 "A list of the scratch variables used by functions called by 63 "A list of the scratch variables used by functions called by
66 w3-parse-buffer which it is w3-parse-buffer's responsibility to 64 w3-parse-buffer which it is w3-parse-buffer's responsibility to
115 113
116 (defvar w3-p-d-open-element-stack) 114 (defvar w3-p-d-open-element-stack)
117 (put 'w3-p-d-open-element-stack 'variable-documentation 115 (put 'w3-p-d-open-element-stack 'variable-documentation
118 "A stack of the currently open elements, with the innermost enclosing 116 "A stack of the currently open elements, with the innermost enclosing
119 element on top and the outermost on bottom.") 117 element on top and the outermost on bottom.")
118
119 (defvar w3-p-d-parse-tag-stream-tail-pointer)
120 (put 'w3-p-d-parse-tag-stream-tail-pointer 'variable-documentation
121 "Points to last cons cell in parse-tag stream. We add items to tail of
122 parse-tag-stream instead of head.")
120 123
121 (defvar w3-p-d-shortrefs) 124 (defvar w3-p-d-shortrefs)
122 (put 'w3-p-d-shortrefs 'variable-documentation 125 (put 'w3-p-d-shortrefs 'variable-documentation
123 "An alist of the magic entity reference strings in the current 126 "An alist of the magic entity reference strings in the current
124 between-tags region and their replacements. Each item is of the format 127 between-tags region and their replacements. Each item is of the format
262 (goto-char (point-min)) 265 (goto-char (point-min))
263 (insert "\"") 266 (insert "\"")
264 (while (progn 267 (while (progn
265 (skip-chars-forward "^\"\\\t\n\r") 268 (skip-chars-forward "^\"\\\t\n\r")
266 (not (eobp))) 269 (not (eobp)))
267 (insert "\\" (cdr (assq (char-after (point)) '((?\" . "\"") 270 (insert "\\" (cdr (assq (following-char) '((?\" . "\"")
268 (?\\ . "\\") 271 (?\\ . "\\")
269 (?\t . "t") 272 (?\t . "t")
270 (?\n . "n") 273 (?\n . "n")
271 (?\r . "r"))))) 274 (?\r . "r")))))
272 (delete-char 1)) 275 (delete-char 1))
273 (insert "\"") 276 (insert "\"")
274 (buffer-string))) 277 (buffer-string)))
275 278
276 279
284 (let ((html-entities w3-html-entities)) 287 (let ((html-entities w3-html-entities))
285 (while html-entities 288 (while html-entities
286 (put (car (car html-entities)) 'html-entity-expansion 289 (put (car (car html-entities)) 'html-entity-expansion
287 (cons 'CDATA (if (integerp (cdr (car html-entities))) 290 (cons 'CDATA (if (integerp (cdr (car html-entities)))
288 (char-to-string 291 (char-to-string
289 (mule-make-iso-character (cdr (car html-entities)))) 292 (let ((c (cdr (car html-entities))))
293 (cond
294 ((and (> c 127) (boundp 'MULE))
295 (make-character lc-ltn1 c))
296 ;;((and (> c 127) (featurep 'mule))
297 ;; What???
298 ;;)
299 (t
300 c))))
290 (cdr (car html-entities))))) 301 (cdr (car html-entities)))))
291 (setq html-entities (cdr html-entities)))) 302 (setq html-entities (cdr html-entities))))
292 303
293 ;; These are handled differently than the normal HTML entities because 304 ;; These are handled differently than the normal HTML entities because
294 ;; we need to define the entities with 'nil instead of 'CDATA so 305 ;; we need to define the entities with 'nil instead of 'CDATA so
428 ;; char-to-string will hopefully do something useful with characters 439 ;; char-to-string will hopefully do something useful with characters
429 ;; larger than 255. I think in MULE it does. Is this true? 440 ;; larger than 255. I think in MULE it does. Is this true?
430 ;; Bill wants to call w3-resolve-numeric-entity here, but I think 441 ;; Bill wants to call w3-resolve-numeric-entity here, but I think
431 ;; that functionality belongs in char-to-string. 442 ;; that functionality belongs in char-to-string.
432 ;; The largest valid character in the I18N version of HTML is 65533. 443 ;; The largest valid character in the I18N version of HTML is 65533.
433 ;; ftp://ds.internic.net/internet-drafts/draft-ietf-html-i18n-01.txt 444 ;; <URL:ftp://ds.internic.net/internet-drafts/draft-ietf-html-i18n-01.txt>
434 ;; wrongo! Apparently, mule doesn't do sane things with char-to-string 445 ;; wrongo! Apparently, mule doesn't do sane things with char-to-string
435 ;; -wmp 7/9/96 446 ;; -wmp 7/9/96
436 (insert (char-to-string 447 (insert (char-to-string
437 (mule-make-iso-character w3-p-s-num)))) 448 (cond
449 ((and (boundp 'MULE) (> w3-p-s-num 127))
450 (make-character lc-ltn1 w3-p-s-num))
451 ;;((and (featurep 'mule) (> w3-p-s-num 127))
452 ;;what??
453 ;;)
454 (t
455 w3-p-s-num)))))
438 ((looking-at "&#\\(re\\|rs\\|space\\|tab\\)[\ ;\n]?") ; \n should be \r 456 ((looking-at "&#\\(re\\|rs\\|space\\|tab\\)[\ ;\n]?") ; \n should be \r
439 (replace-match (assq (upcase (char-after (+ 3 (point)))) 457 (replace-match (assq (upcase (char-after (+ 3 (point))))
440 '(;; *** Strictly speaking, record end should be 458 '(;; *** Strictly speaking, record end should be
441 ;; carriage return. 459 ;; carriage return.
442 (?E . "\n") ; RE 460 (?E . "\n") ; RE
448 ;; doesn't get rescanned. 466 ;; doesn't get rescanned.
449 ;; *** Strictly speaking, we should issue a warning for &#foo; if foo 467 ;; *** Strictly speaking, we should issue a warning for &#foo; if foo
450 ;; is not a function character in the SGML declaration. 468 ;; is not a function character in the SGML declaration.
451 ) 469 )
452 470
453 ((eq ?& (char-after (point))) 471 ((eq ?& (following-char))
454 ;; We are either looking at an undefined reference or a & that does 472 ;; We are either looking at an undefined reference or a & that does
455 ;; not start a reference (in which case we should not have been called). 473 ;; not start a reference (in which case we should not have been called).
456 ;; Skip over the &. 474 ;; Skip over the &.
457 (forward-char 1)) 475 (forward-char 1))
458 476
765 (concat "^&<" 783 (concat "^&<"
766 (w3-invalid-sgml-chars) 784 (w3-invalid-sgml-chars)
767 (if w3-p-d-null-end-tag-enabled "/" "") 785 (if w3-p-d-null-end-tag-enabled "/" "")
768 (if w3-p-d-in-parsed-marked-section "]" "") 786 (if w3-p-d-in-parsed-marked-section "]" "")
769 (or w3-p-d-shortref-chars "")))) 787 (or w3-p-d-shortref-chars ""))))
788
789 ;; Modifies free variable:
790 ;; w3-p-d-parse-tag-stream-tail-pointer
791 (defsubst w3-add-display-item (tag value)
792 (setcdr w3-p-d-parse-tag-stream-tail-pointer
793 (list (cons tag value)))
794 (setq w3-p-d-parse-tag-stream-tail-pointer
795 (cdr w3-p-d-parse-tag-stream-tail-pointer)))
796
770 ) 797 )
771 798
772 (eval-when-compile 799 (eval-when-compile
773 (w3-p-s-var-def w3-p-s-overrides) 800 (w3-p-s-var-def w3-p-s-overrides)
774 (w3-p-s-var-def w3-p-s-undo-list) 801 (w3-p-s-var-def w3-p-s-undo-list)
779 ;; w3-p-d-current-element, w3-p-d-open-element-stack 806 ;; w3-p-d-current-element, w3-p-d-open-element-stack
780 ;; Destroys free variables: 807 ;; Destroys free variables:
781 ;; w3-p-s-overrides, w3-p-s-undo-list, w3-p-s-var 808 ;; w3-p-s-overrides, w3-p-s-undo-list, w3-p-s-var
782 (defsubst w3-open-element (tag attributes) 809 (defsubst w3-open-element (tag attributes)
783 810
811 ;; Send trailing data character item in the old current element to
812 ;; display engine.
813 (if (stringp (car-safe (w3-element-content w3-p-d-current-element)))
814 (w3-add-display-item
815 'text
816 (car-safe (w3-element-content w3-p-d-current-element))))
817
784 ;; Push new element on stack. 818 ;; Push new element on stack.
785 (setq w3-p-d-open-element-stack (cons w3-p-d-current-element 819 (setq w3-p-d-open-element-stack (cons w3-p-d-current-element
786 w3-p-d-open-element-stack)) 820 w3-p-d-open-element-stack))
787 (setq w3-p-d-current-element (w3-fresh-element-for-tag tag)) 821 (setq w3-p-d-current-element (w3-fresh-element-for-tag tag))
788 822
838 (if (eq 'XINHERIT (w3-element-content-model w3-p-d-current-element)) 872 (if (eq 'XINHERIT (w3-element-content-model w3-p-d-current-element))
839 (w3-set-element-content-model 873 (w3-set-element-content-model
840 w3-p-d-current-element 874 w3-p-d-current-element
841 (w3-element-content-model (car w3-p-d-open-element-stack)))) 875 (w3-element-content-model (car w3-p-d-open-element-stack))))
842 876
843 ) 877 ;; Send the start-tag and attributes to the display engine.
878 (if (memq tag '(plaintext style xmp textarea))
879 ;; Garbage special-casing for old display engine.
880 ;; Nothing is sent until end-tag is found.
881 ;; The DTD will ensure no subelements of these elements.
882 nil
883 ;; Normal procedure.
884 (w3-add-display-item tag attributes)))
844 ) 885 )
845 886
846 ;; The protocol for handing items to the display engine is as follows. 887 ;; The protocol for handing items to the display engine is as follows.
847 ;; 888 ;;
848 ;; For an element, send (START-TAG . ATTS), each member of the content, 889 ;; For an element, send (START-TAG . ATTS), each member of the content,
917 ;; any trailing data character item to display engine. 958 ;; any trailing data character item to display engine.
918 (setq w3-p-s-content (w3-element-content w3-p-d-current-element)) 959 (setq w3-p-s-content (w3-element-content w3-p-d-current-element))
919 (cond ((null w3-p-s-content)) 960 (cond ((null w3-p-s-content))
920 ((equal "\n" (car w3-p-s-content)) 961 ((equal "\n" (car w3-p-s-content))
921 (setq w3-p-s-content (cdr w3-p-s-content))) 962 (setq w3-p-s-content (cdr w3-p-s-content)))
922 ) 963 ((and (stringp (car w3-p-s-content))
964 ;; Garbage special-casing for old display engine.
965 (not (memq w3-p-s-end-tag
966 '(/plaintext /style /xmp /textarea))))
967 (w3-add-display-item 'text (car w3-p-s-content))))
923 968
924 (cond ;; *** Handle LISTING the way the old parser did. 969 ;; Send the end-tag to the display engine, but only if the element is
970 ;; allowed to have an end tag.
971 (cond ((memq w3-p-s-end-tag '(/plaintext /style /xmp /textarea))
972 ;; Garbage special-casing for old display engine.
973 ;; Format old display engine expects for these elements:
974 ;; (START-TAG . ((data . DATA-CHARACTERS) . ATTRIBUTES))
975 (w3-add-display-item
976 ;; Use the *start*-tag, not the end-tag.
977 (w3-element-name w3-p-d-current-element)
978 (cons (cons 'data
979 (condition-case nil
980 (mapconcat 'identity w3-p-s-content "")
981 (error "eeek! subelement content!")))
982 (w3-element-attributes w3-p-d-current-element))))
983 ;; *** Handle LISTING the way the old parser did.
925 ((eq 'EMPTY (w3-element-content-model w3-p-d-current-element)) 984 ((eq 'EMPTY (w3-element-content-model w3-p-d-current-element))
926 ;; Do nothing, can't have an end tag. 985 ;; Do nothing, can't have an end tag.
927 ) 986 )
928 (t 987 (t
929 ;; Normal case. 988 ;; Normal case.
989 (w3-add-display-item w3-p-s-end-tag nil)
930 (if (null w3-p-s-content) 990 (if (null w3-p-s-content)
931 (w3-debug-html 991 (w3-debug-html
932 :bad-style :outer 992 :bad-style :outer
933 ;; Don't warn for empty TD elements or empty A elements 993 ;; Don't warn for empty TD elements or empty A elements
934 ;; with no HREF attribute. 994 ;; with no HREF attribute.
1111 (%headmisc . (script)) 1171 (%headmisc . (script))
1112 (%head-deprecated . (nextid)) 1172 (%head-deprecated . (nextid))
1113 1173
1114 ;; client-side imagemaps 1174 ;; client-side imagemaps
1115 (%imagemaps . (area map)) 1175 (%imagemaps . (area map))
1116 (%input.fields . (input select textarea keygen label))
1117 ;; special action is taken for %text inside %body.content in the 1176 ;; special action is taken for %text inside %body.content in the
1118 ;; content model of each element. 1177 ;; content model of each element.
1119 (%body.content . (%heading %block style hr div address %imagemaps)) 1178 (%body.content . (%heading %block hr div address %imagemaps))
1120 1179
1121 (%heading . (h1 h2 h3 h4 h5 h6)) 1180 (%heading . (h1 h2 h3 h4 h5 h6))
1122 1181
1123 ;; Emacs-w3 extensions 1182 ;; Emacs-w3 extensions
1124 (%emacsw3-crud . (pinhead flame cookie yogsothoth hype peek)) 1183 (%emacsw3-crud . (pinhead flame cookie yogsothoth hype peek))
1125 1184
1126 (%block . (p %list dl form %preformatted 1185 (%block . (p %list dl form %preformatted font
1127 %blockquote isindex fn table fig note 1186 %blockquote isindex fn table fig note
1128 multicol center %block-deprecated %block-obsoleted)) 1187 center %block-deprecated %block-obsoleted))
1129 (%list . (ul ol)) 1188 (%list . (ul ol))
1130 (%preformatted . (pre)) 1189 (%preformatted . (pre))
1131 (%blockquote . (bq)) 1190 (%blockquote . (bq))
1132 (%block-deprecated . (dir menu blockquote)) 1191 (%block-deprecated . (dir menu blockquote))
1133 (%block-obsoleted . (xmp listing)) 1192 (%block-obsoleted . (xmp listing))
1134 1193
1135 ;; Why is IMG in this list? 1194 ;; Why is IMG in this list?
1136 (%pre.exclusion . (*include img *discard tab math big small sub sup)) 1195 (%pre.exclusion . (*include img *discard tab math big small sub sup))
1137 1196
1138 (%text . (*data b %notmath sub sup %emacsw3-crud %input.fields)) 1197 (%text . (*data b %notmath sub sup %emacsw3-crud))
1139 (%notmath . (%special %font %phrase %misc)) 1198 (%notmath . (%special %font %phrase %misc))
1140 (%font . (i u s strike tt big small sub sup font 1199 (%font . (i u s strike tt big small sub sup
1141 roach secret wired)) ;; B left out for MATH 1200 roach secret wired)) ;; B left out for MATH
1142 (%phrase . (em strong dfn code samp kbd var cite blink)) 1201 (%phrase . (em strong dfn code samp kbd var cite blink))
1143 (%special . (a img applet object font basefont br script style map math tab span bdo)) 1202 (%special . (a img applet font br script map math tab))
1144 (%misc . (q lang au person acronym abbrev ins del)) 1203 (%misc . (q lang au person acronym abbrev ins del))
1145 1204
1146 (%formula . (*data %math)) 1205 (%formula . (*data %math))
1147 (%math . (box above below %mathvec root sqrt array sub sup 1206 (%math . (box above below %mathvec root sqrt array sub sup
1148 %mathface)) 1207 %mathface))
1216 ;; I haven't bothered to enumerate them. 1275 ;; I haven't bothered to enumerate them.
1217 (*close))]) 1276 (*close))])
1218 (end-tag-omissible . t)) 1277 (end-tag-omissible . t))
1219 ;; SCRIPT - - (#PCDATA) 1278 ;; SCRIPT - - (#PCDATA)
1220 ((script) 1279 ((script)
1221 (content-model . XCDATA ; not official, but allows 1280 (content-model . CDATA ; not official, but allows
1222 ; comment hiding of script, and also 1281 ; comment hiding of script
1223 ; idiots that use '</' in scripts.
1224 )) 1282 ))
1225 ;; TITLE - - (#PCDATA) 1283 ;; TITLE - - (#PCDATA)
1226 ((title) 1284 ((title)
1227 (content-model . RCDATA ; not official 1285 (content-model . RCDATA ; not official
1228 ;; [((*data) include-space nil nil)] 1286 ;; [((*data) include-space nil nil)]
1250 ;; caused by BQ's sharing of BODYTEXT. BQ 1308 ;; caused by BQ's sharing of BODYTEXT. BQ
1251 ;; should have its own BQTEXT. 1309 ;; should have its own BQTEXT.
1252 ((credit plaintext) *close)) 1310 ((credit plaintext) *close))
1253 nil)]) 1311 nil)])
1254 (end-tag-omissible . t)) 1312 (end-tag-omissible . t))
1255 ((div banner center multicol) 1313 ((div banner center)
1256 (content-model . [((%body.content) 1314 (content-model . [((%body.content)
1257 nil 1315 nil
1258 ;; Push <P> before data characters. Non-SGML. 1316 ;; Push <P> before data characters. Non-SGML.
1259 (((%text) p)) 1317 (((%text) p))
1260 nil)])) 1318 nil)]))
1267 ((%heading) 1325 ((%heading)
1268 (content-model . [((%text) 1326 (content-model . [((%text)
1269 include-space 1327 include-space
1270 ((%in-text-ignore)) 1328 ((%in-text-ignore))
1271 nil)])) 1329 nil)]))
1272 ((span bdo)
1273 (content-model . [((%text)
1274 include-space
1275 nil
1276 nil)])
1277 )
1278 ((p) 1330 ((p)
1279 (content-model . [((%text) 1331 (content-model . [((%text)
1280 include-space 1332 include-space
1281 nil 1333 nil
1282 ;; *** Should only close if tag can 1334 ;; *** Should only close if tag can
1402 ;; Push <P> before data characters. Very non-SGML. 1454 ;; Push <P> before data characters. Very non-SGML.
1403 (((%text) p) 1455 (((%text) p)
1404 ((credit) *close)) 1456 ((credit) *close))
1405 nil)]) 1457 nil)])
1406 (end-tag-omissible . t)) 1458 (end-tag-omissible . t))
1407 ((%emacsw3-crud basefont) 1459 ((%emacsw3-crud)
1408 (content-model . EMPTY)) 1460 (content-model . EMPTY))
1409 ;; FORM - - %body.content -(FORM) +(INPUT|KEYGEN|SELECT|TEXTAREA) 1461 ;; FORM - - %body.content -(FORM) +(INPUT|KEYGEN|SELECT|TEXTAREA)
1410 ((form) 1462 ((form)
1411 ;; Same as BODY. Ugh! 1463 ;; Same as BODY. Ugh!
1412 (content-model . [((%body.content) 1464 (content-model . [((%body.content)
1484 ;; <URL:ftp://ds.internic.net/internet-drafts/draft-ietf-html-tables-03.txt> 1536 ;; <URL:ftp://ds.internic.net/internet-drafts/draft-ietf-html-tables-03.txt>
1485 ((table) 1537 ((table)
1486 (content-model . [(nil 1538 (content-model . [(nil
1487 nil 1539 nil
1488 (((caption) *include *next) 1540 (((caption) *include *next)
1489 ((%text) tr *same error)
1490 ((col colgroup thead tfoot tbody tr) *retry *next)) 1541 ((col colgroup thead tfoot tbody tr) *retry *next))
1491 (*retry *next)) ;error handling 1542 (*retry *next)) ;error handling
1492 ((col colgroup) 1543 ((col colgroup)
1493 nil 1544 nil
1494 (((thead tfoot tbody tr) *retry *next)) 1545 (((thead tfoot tbody tr) *retry *next))
1528 ((tfoot tbody) 1579 ((tfoot tbody)
1529 (content-model . [((tr) 1580 (content-model . [((tr)
1530 nil 1581 nil
1531 (((tbody) *close) 1582 (((tbody) *close)
1532 ;; error handling 1583 ;; error handling
1533 ((td th) tr *same error)
1534 ((%body.content) tr *same error)) 1584 ((%body.content) tr *same error))
1535 nil)]) 1585 nil)])
1536 (end-tag-omissible . t)) 1586 (end-tag-omissible . t))
1537 ((tr) 1587 ((tr)
1538 (content-model . [((td th) 1588 (content-model . [((td th)
1539 nil 1589 nil
1540 (((tr tfoot tbody) *close) 1590 (((tr tfoot tbody) *close)
1541 ;; error handling 1591 ;; error handling
1542 ((%body.content %text) td *same error)) 1592 ((%body.content) td *same error))
1543 nil)]) 1593 nil)])
1544 (end-tag-omissible . t)) 1594 (end-tag-omissible . t))
1545 ((td th) 1595 ((td th)
1546 ;; Arrgh! Another %body.content!!! Stupid!!! 1596 ;; Arrgh! Another %body.content!!! Stupid!!!
1547 (content-model . [((%body.content) 1597 (content-model . [((%body.content)
1664 nil)])) 1714 nil)]))
1665 ((frame) 1715 ((frame)
1666 (content-model . EMPTY)) 1716 (content-model . EMPTY))
1667 ;; 1717 ;;
1668 ;; APPLET is a Java thing. 1718 ;; APPLET is a Java thing.
1669 ;; OBJECT is a cougar thing
1670 ;; <URL:http://java.sun.com/JDK-beta/filesinkit/README> 1719 ;; <URL:http://java.sun.com/JDK-beta/filesinkit/README>
1671 ((applet object) 1720 ((applet)
1672 ;; I really don't want to add another ANY content-model. 1721 ;; I really don't want to add another ANY content-model.
1673 (content-model . XINHERIT) 1722 (content-model . XINHERIT)
1674 (inclusions . (param))) 1723 (inclusions . (param)))
1675 ((param) 1724 ((param)
1676 (content-model . EMPTY)) 1725 (content-model . EMPTY))
1698 (w3-p-s-var-def w3-p-s-state-transitions) 1747 (w3-p-s-var-def w3-p-s-state-transitions)
1699 (w3-p-s-var-def w3-p-s-transition) 1748 (w3-p-s-var-def w3-p-s-transition)
1700 (w3-p-s-var-def w3-p-s-tran-list) 1749 (w3-p-s-var-def w3-p-s-tran-list)
1701 (w3-p-s-var-def w3-p-s-content-model) 1750 (w3-p-s-var-def w3-p-s-content-model)
1702 (w3-p-s-var-def w3-p-s-except) 1751 (w3-p-s-var-def w3-p-s-except)
1703 (w3-p-s-var-def w3-p-s-baseobject)
1704 (w3-p-s-var-def w3-p-s-btdt)
1705 ;; Uses free variables: 1752 ;; Uses free variables:
1706 ;; w3-p-d-current-element, w3-p-d-exceptions 1753 ;; w3-p-d-current-element, w3-p-d-exceptions
1707 ;; Destroys free variables: 1754 ;; Destroys free variables:
1708 ;; w3-p-s-includep, w3-p-s-state-transitions, w3-p-s-transition, 1755 ;; w3-p-s-includep, w3-p-s-state-transitions, w3-p-s-transition,
1709 ;; w3-p-s-tran-list, w3-p-s-content-model, w3-p-s-except 1756 ;; w3-p-s-tran-list, w3-p-s-content-model, w3-p-s-except
1716 (w3-element-content-model w3-p-d-current-element))) 1763 (w3-element-content-model w3-p-d-current-element)))
1717 (or (and (memq w3-p-s-content-model 1764 (or (and (memq w3-p-s-content-model
1718 '(CDATA RCDATA XCDATA XXCDATA)) 1765 '(CDATA RCDATA XCDATA XXCDATA))
1719 (memq tag-name '(*data *space))) 1766 (memq tag-name '(*data *space)))
1720 ;; *** Implement ANY. 1767 ;; *** Implement ANY.
1721 (error "impossible content model lossage")) 1768 (error "impossible"))
1722 (setq w3-p-s-includep t) 1769 (setq w3-p-s-includep t)
1723 ;; Exit loop. 1770 ;; Exit loop.
1724 nil) 1771 nil)
1725 (t 1772 (t
1726 ;; We have a complex content model. 1773 ;; We have a complex content model.
1896 (w3-open-element (car w3-p-s-transition) nil) 1943 (w3-open-element (car w3-p-s-transition) nil)
1897 ;; Now we loop and try again in the new element's 1944 ;; Now we loop and try again in the new element's
1898 ;; content-model. 1945 ;; content-model.
1899 t) 1946 t)
1900 (t 1947 (t
1901 (error "impossible transition"))))))) 1948 (error "impossible")))))))
1902 1949
1903 ;; Empty while loop body. 1950 ;; Empty while loop body.
1904 ) 1951 )
1905 1952
1906 ;; Return value to user indicating whether to include or discard item: 1953 ;; Return value to user indicating whether to include or discard item:
1936 ;; % % 1983 ;; % %
1937 ;; % This is the *ONLY* valid entry point in this file! % 1984 ;; % This is the *ONLY* valid entry point in this file! %
1938 ;; % DO NOT call any of the other functions! % 1985 ;; % DO NOT call any of the other functions! %
1939 ;; % % 1986 ;; % %
1940 ;; %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 1987 ;; %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1941 (defun w3-parse-buffer (&optional buff) 1988 (defun w3-parse-buffer (&optional buff nodraw)
1942 "Parse contents of BUFF as HTML. 1989 "Parse contents of BUFF as HTML.
1943 BUFF defaults to the value of url-working-buffer. 1990 BUFF defaults to the value of url-working-buffer.
1944 Destructively alters contents of BUFF. 1991 Destructively alters contents of BUFF.
1992 Unless optional second argument NODRAW is non-nil, calls the display
1993 engine on the parsed HTML.
1945 Returns a data structure containing the parsed information." 1994 Returns a data structure containing the parsed information."
1946 1995
1947 (set-buffer (or buff url-working-buffer)) 1996 (set-buffer (or buff url-working-buffer))
1948 (setq buff (current-buffer)) 1997 (setq buff (current-buffer))
1949 (set-syntax-table w3-sgml-md-syntax-table) 1998 (set-syntax-table w3-sgml-md-syntax-table)
1960 (goto-char (point-min)) 2009 (goto-char (point-min))
1961 2010
1962 ;; *** Should premunge line boundaries. 2011 ;; *** Should premunge line boundaries.
1963 ;; ******************** 2012 ;; ********************
1964 2013
2014 ;; Prepare another buffer to draw in unless told not to.
2015 (if (not nodraw)
2016 (w3-prepare-draw-buffer-for-parse-buffer))
2017
1965 (let* ( 2018 (let* (
1966 ;; Speed hack, see the variable doc string. 2019 ;; Speed hack, see the variable doc string.
1967 (gc-cons-threshold (if (> w3-gc-cons-threshold-multiplier 0) 2020 (gc-cons-threshold (if (> w3-gc-cons-threshold-multiplier 0)
1968 (* w3-gc-cons-threshold-multiplier 2021 (* w3-gc-cons-threshold-multiplier
1969 gc-cons-threshold) 2022 gc-cons-threshold)
1983 (format "Parsed %%3d%%%% of %d..." (- (point-max) (point-min))))) 2036 (format "Parsed %%3d%%%% of %d..." (- (point-max) (point-min)))))
1984 2037
1985 ;; Use a float value for 100 if possible, otherwise integer. 2038 ;; Use a float value for 100 if possible, otherwise integer.
1986 ;; Determine which we can use outside of the loop for speed. 2039 ;; Determine which we can use outside of the loop for speed.
1987 (one-hundred (funcall (if (fboundp 'float) 'float 'identity) 100)) 2040 (one-hundred (funcall (if (fboundp 'float) 'float 'identity) 100))
2041
2042 ;; Speed up checking whether to do incremental display.
2043 (w3-do-incremental-display (if nodraw nil w3-do-incremental-display))
2044
2045 ;; Used to convert parse tree to tag stream that old display
2046 ;; engine expects. Will change when display engine is rewritten.
2047 (parse-tag-stream '(*dummy))
2048
2049 ;; See doc string.
2050 (w3-p-d-parse-tag-stream-tail-pointer parse-tag-stream)
2051
2052 ;; Points to cons cell in parse-tag-stream whose car is the last
2053 ;; item that has been sent to display engine.
2054 (parse-tag-stream-last-displayed-item parse-tag-stream)
1988 2055
1989 ;; The buffer which contains the HTML we are parsing. This 2056 ;; The buffer which contains the HTML we are parsing. This
1990 ;; variable is used to avoid using the more expensive 2057 ;; variable is used to avoid using the more expensive
1991 ;; save-excursion. 2058 ;; save-excursion.
1992 (parse-buffer (current-buffer)) 2059 (parse-buffer (current-buffer))
2069 ;; Scratch variables used in this function 2136 ;; Scratch variables used in this function
2070 ref attr-name attr-value content-model content open-list 2137 ref attr-name attr-value content-model content open-list
2071 ) 2138 )
2072 ;; Scratch variables used by macros and defsubsts we call. 2139 ;; Scratch variables used by macros and defsubsts we call.
2073 (w3-p-s-let-bindings 2140 (w3-p-s-let-bindings
2141
2074 (w3-update-non-markup-chars) 2142 (w3-update-non-markup-chars)
2075 (setq w3-p-s-baseobject (url-generic-parse-url (url-view-url t))) 2143
2076 ;; Main loop. Handle markup as follows: 2144 ;; Main loop. Handle markup as follows:
2077 ;; 2145 ;;
2078 ;; non-empty tag: Handle the region since the previous tag as PCDATA, 2146 ;; non-empty tag: Handle the region since the previous tag as PCDATA,
2079 ;; RCDATA, CDATA, if allowed by syntax. Then handle the tag. 2147 ;; RCDATA, CDATA, if allowed by syntax. Then handle the tag.
2080 ;; 2148 ;;
2103 (setq last-loop-start (point))) 2171 (setq last-loop-start (point)))
2104 2172
2105 ;; Display progress messages if asked and/or do incremental display 2173 ;; Display progress messages if asked and/or do incremental display
2106 ;; of results 2174 ;; of results
2107 (cond ((= 0 (% (setq loop-count (1+ loop-count)) 40)) 2175 (cond ((= 0 (% (setq loop-count (1+ loop-count)) 40))
2176 (if w3-do-incremental-display
2177 (w3-pause))
2108 (if status-message-format 2178 (if status-message-format
2109 (message status-message-format 2179 (message status-message-format
2110 ;; Percentage of buffer processed. 2180 ;; Percentage of buffer processed.
2111 (/ (* (point) one-hundred) (point-max)))))) 2181 (/ (* (point) one-hundred) (point-max))))))
2112 2182
2115 2185
2116 ;; We are looking at a markup-starting character, and invalid 2186 ;; We are looking at a markup-starting character, and invalid
2117 ;; character, or end of buffer. 2187 ;; character, or end of buffer.
2118 (cond 2188 (cond
2119 2189
2120 ((eq ?< (char-after (point))) 2190 ((= ?< (following-char))
2121 2191
2122 ;; We are looking at a tag, comment, markup declaration, SGML marked 2192 ;; We are looking at a tag, comment, markup declaration, SGML marked
2123 ;; section, SGML processing instruction, or non-markup "<". 2193 ;; section, SGML processing instruction, or non-markup "<".
2124 (forward-char) 2194 (forward-char)
2125 (cond 2195 (cond
2126 2196
2127 ((looking-at "/?\\([a-z][-a-z0-9.]*\\)") 2197 ((looking-at "/?\\([a-z][-a-z0-9.]*\\)")
2128 ;; We are looking at a non-empty tag. 2198 ;; We are looking at a non-empty tag.
2129 2199
2130 ;; Downcase it in the buffer, to save creation of a string
2131 (downcase-region (match-beginning 1) (match-end 1))
2132 (setq w3-p-d-tag-name 2200 (setq w3-p-d-tag-name
2133 (intern (buffer-substring (match-beginning 1) 2201 (intern (downcase (buffer-substring (match-beginning 1)
2134 (match-end 1)))) 2202 (match-end 1)))))
2135 (setq w3-p-d-end-tag-p (eq ?/ (char-after (point))) 2203 (setq w3-p-d-end-tag-p (= ?/ (following-char)))
2136 between-tags-end (1- (point))) 2204 (setq between-tags-end (1- (point)))
2137 (goto-char (match-end 0)) 2205 (goto-char (match-end 0))
2138 2206
2139 ;; Read the attributes from a start-tag. 2207 ;; Read the attributes from a start-tag.
2140 (if w3-p-d-end-tag-p 2208 (or
2141 (if (looking-at "[ \t\r\n/]*[<>]") 2209 w3-p-d-end-tag-p
2142 nil
2143 ;; This is in here to deal with those idiots who stick
2144 ;; attribute/value pairs on end tags. *sigh*
2145 (w3-debug-html "Evil attributes on end tag.")
2146 (skip-chars-forward "^>"))
2147 2210
2148 ;; Attribute values can be: 2211 ;; Attribute values can be:
2149 ;; "STRING" where STRING does not contain the double quote 2212 ;; "STRING" where STRING does not contain the double quote
2150 ;; 'STRING' where STRING does not contain the single quote 2213 ;; 'STRING' where STRING does not contain the single quote
2151 ;; name-start character, *name character 2214 ;; name-start character, *name character
2162 (concat 2225 (concat
2163 ;; Leading whitespace. 2226 ;; Leading whitespace.
2164 "[ \n\r\t]*" 2227 "[ \n\r\t]*"
2165 ;; The attribute name, possibly with a bad syntax 2228 ;; The attribute name, possibly with a bad syntax
2166 ;; component. 2229 ;; component.
2167 "\\([a-z_][-a-z0-9.]*\\(\\([_][-a-z0-9._]*\\)?\\)\\)" 2230 "\\([a-z][-a-z0-9.]*\\(\\([_][-a-z0-9._]*\\)?\\)\\)"
2168 ;; Trailing whitespace and perhaps an "=". 2231 ;; Trailing whitespace and perhaps an "=".
2169 "[ \n\r\t]*\\(\\(=[ \n\r\t]*\\)?\\)"))) 2232 "[ \n\r\t]*\\(\\(=[ \n\r\t]*\\)?\\)")))
2170 2233
2171 (cond ((/= (match-beginning 2) (match-end 2)) 2234 (cond ((/= (match-beginning 2) (match-end 2))
2172 (w3-debug-html 2235 (w3-debug-html
2173 :nocontext 2236 :nocontext
2174 (format "Bad attribute name syntax: %s" 2237 (format "Bad attribute name syntax: %s"
2175 (buffer-substring (match-beginning 1) 2238 (buffer-substring (match-beginning 1)
2176 (match-end 1)))))) 2239 (match-end 1))))))
2177 2240
2178 ;; Downcase it in the buffer, to save creation of a string
2179 (downcase-region (match-beginning 1) (match-end 1))
2180 (setq attr-name 2241 (setq attr-name
2181 (intern (buffer-substring (match-beginning 1) 2242 (intern (downcase (buffer-substring (match-beginning 1)
2182 (match-end 1)))) 2243 (match-end 1)))))
2183 (goto-char (match-end 0)) 2244 (goto-char (match-end 0))
2184 (cond 2245 (cond
2185 ((< (match-beginning 4) (match-end 4)) 2246 ((< (match-beginning 4) (match-end 4))
2186 ;; A value was specified (e.g. ATTRIBUTE=VALUE). 2247 ;; A value was specified (e.g. ATTRIBUTE=VALUE).
2187 (cond 2248 (cond
2190 (concat 2251 (concat
2191 ;; Literal with double quotes. 2252 ;; Literal with double quotes.
2192 "\"\\([^\"]*\\)\"" 2253 "\"\\([^\"]*\\)\""
2193 "\\|" 2254 "\\|"
2194 ;; Literal with single quotes. 2255 ;; Literal with single quotes.
2195 "'\\([^']*\\)'" 2256 "'\\([^']\\)*'"
2196 "\\|" 2257 "\\|"
2197 ;; Handle bad HTML conflicting with NET-enabling 2258 ;; Handle bad HTML conflicting with NET-enabling
2198 ;; start-tags. 2259 ;; start-tags.
2199 "\\([-a-z0-9.]+/[-a-z0-9._/#]+\\)[ \t\n\r>]" 2260 "\\([-a-z0-9.]+/[-a-z0-9._/#]+\\)[ \t\n\r>]"
2200 "\\|" 2261 "\\|"
2223 (goto-char (point-min)) 2284 (goto-char (point-min))
2224 (while (progn 2285 (while (progn
2225 (skip-chars-forward "^&") 2286 (skip-chars-forward "^&")
2226 (not (eobp))) 2287 (not (eobp)))
2227 (w3-expand-entity-at-point-maybe)) 2288 (w3-expand-entity-at-point-maybe))
2228 (subst-char-in-region (point-min) (point-max) ?\t ? ) 2289 (subst-char-in-region (point-min) (point-max) ?\t 32)
2229 (subst-char-in-region (point-min) (point-max) ?\n ? )) 2290 (subst-char-in-region (point-min) (point-max) ?\n 32))
2230 ;; Set this after we have changed the size of the 2291 ;; Set this after we have changed the size of the
2231 ;; attribute. 2292 ;; attribute.
2232 (setq attribute-value-end (1+ (point-max)))) 2293 (setq attribute-value-end (1+ (point-max))))
2233 ((match-beginning 4) 2294 ((match-beginning 4)
2234 (setq attribute-value-end (match-end 4)) 2295 (setq attribute-value-end (match-end 4))
2242 (match-end 3))) 2303 (match-end 3)))
2243 (w3-debug-html :nocontext 2304 (w3-debug-html :nocontext
2244 (format "Evil attribute value syntax: %s" 2305 (format "Evil attribute value syntax: %s"
2245 (buffer-substring (point-min) (point-max))))) 2306 (buffer-substring (point-min) (point-max)))))
2246 (t 2307 (t
2247 (error "impossible attribute value")))) 2308 (error "impossible"))))
2248 ((memq (char-after (point)) '(?\" ?')) 2309 ((memq (following-char) '(?\" ?'))
2249 ;; Missing terminating quote character. 2310 ;; Missing terminating quote character.
2250 (narrow-to-region (point) 2311 (narrow-to-region (point)
2251 (progn 2312 (progn
2252 (forward-char 1) 2313 (forward-char 1)
2253 (skip-chars-forward "^ \t\n\r'\"<>") 2314 (skip-chars-forward "^ \t\n\r'\"=<>")
2254 (setq attribute-value-end (point)))) 2315 (setq attribute-value-end (point))))
2255 (w3-debug-html :nocontext 2316 (w3-debug-html :nocontext
2256 (format "Attribute value missing end quote: %s" 2317 (format "Attribute value missing end quote: %s"
2257 (buffer-substring (point-min) (point-max)))) 2318 (buffer-substring (point-min) (point-max))))
2258 (narrow-to-region (1+ (point-min)) (point-max))) 2319 (narrow-to-region (1+ (point-min)) (point-max)))
2259 (t 2320 (t
2260 ;; We have a syntactically invalid attribute value. Let's 2321 ;; We have a syntactically invalid attribute value. Let's
2261 ;; make a best guess as to what the author intended. 2322 ;; make a best guess as to what the author intended.
2262 (narrow-to-region (point) 2323 (narrow-to-region (point)
2263 (progn 2324 (progn
2264 (skip-chars-forward "^ \t\n\r'\"<>") 2325 (skip-chars-forward "^ \t\n\r'\"=<>")
2265 (setq attribute-value-end (point)))) 2326 (setq attribute-value-end (point))))
2266 (w3-debug-html :nocontext 2327 (w3-debug-html :nocontext
2267 (format "Bad attribute value syntax: %s" 2328 (format "Bad attribute value syntax: %s"
2268 (buffer-substring (point-min) (point-max)))))) 2329 (buffer-substring (point-min) (point-max))))))
2269 ;; Now we have isolated the attribute value. We need to 2330 ;; Now we have isolated the attribute value. We need to
2276 ;; * smash case 2337 ;; * smash case
2277 ;; * remove leading/trailing whitespace 2338 ;; * remove leading/trailing whitespace
2278 ;; * smash multiple space sequences into single spaces 2339 ;; * smash multiple space sequences into single spaces
2279 ;; * verify the syntax of each token 2340 ;; * verify the syntax of each token
2280 (setq attr-value (buffer-substring (point-min) (point-max))) 2341 (setq attr-value (buffer-substring (point-min) (point-max)))
2281 (case attr-name
2282 (class
2283 (setq attr-value (split-string attr-value "[ ,]+")))
2284 (align
2285 (if (string-match "^[ \t\r\n]*\\(.*\\)[ \t\r\n]*$"
2286 attr-value)
2287 (setq attr-value (downcase
2288 (substring attr-value
2289 (match-beginning 1)
2290 (match-end 1))))
2291 (setq attr-value (downcase attr-value)))
2292 (setq attr-value (intern attr-value)))
2293 ((src href)
2294 ;; I should expand URLs here
2295 )
2296 (otherwise nil)
2297 )
2298 (widen) 2342 (widen)
2299 (goto-char attribute-value-end)) 2343 (goto-char attribute-value-end))
2300 (t 2344 (t
2301 ;; No value was specified, in which case NAME should be 2345 ;; No value was specified, in which case NAME should be
2302 ;; taken as ATTRIBUTE=NAME where NAME is one of the 2346 ;; taken as ATTRIBUTE=NAME where NAME is one of the
2306 ;; is wrong. 2350 ;; is wrong.
2307 (setq attr-value (symbol-name attr-name)))) 2351 (setq attr-value (symbol-name attr-name))))
2308 2352
2309 ;; Accumulate the attributes. 2353 ;; Accumulate the attributes.
2310 (setq tag-attributes (cons (cons attr-name attr-value) 2354 (setq tag-attributes (cons (cons attr-name attr-value)
2311 tag-attributes))) 2355 tag-attributes))))
2312
2313 (cond
2314 ((and (eq w3-p-d-tag-name 'base)
2315 (setq w3-p-s-baseobject
2316 (or (assq 'src tag-attributes)
2317 (assq 'href tag-attributes))))
2318 (setq w3-p-s-baseobject (url-generic-parse-url
2319 (cdr w3-p-s-baseobject))))
2320 ((setq w3-p-s-btdt (or (assq 'src tag-attributes)
2321 (assq 'href tag-attributes)
2322 (assq 'action tag-attributes)))
2323 (setcdr w3-p-s-btdt (url-expand-file-name (cdr w3-p-s-btdt)
2324 w3-p-s-baseobject))
2325 (setq w3-p-s-btdt (if (url-have-visited-url (cdr w3-p-s-btdt))
2326 ":visited"
2327 ":link"))
2328 (if (assq 'class tag-attributes)
2329 (setcdr (assq 'class tag-attributes)
2330 (cons w3-p-s-btdt
2331 (cdr (assq 'class tag-attributes))))
2332 (setq tag-attributes (cons (cons 'class (list w3-p-s-btdt))
2333 tag-attributes))))
2334 )
2335 (if (not (eq w3-p-d-tag-name 'input))
2336 nil
2337 (setq w3-p-s-btdt (concat ":"
2338 (downcase
2339 (or (cdr-safe
2340 (assq 'type tag-attributes))
2341 "text"))))
2342 (if (assq 'class tag-attributes)
2343 (setcdr (assq 'class tag-attributes)
2344 (cons w3-p-s-btdt
2345 (cdr (assq 'class tag-attributes))))
2346 (setq tag-attributes (cons (cons 'class (list w3-p-s-btdt))
2347 tag-attributes))))
2348 )
2349 2356
2350 ;; Process the end of the tag. 2357 ;; Process the end of the tag.
2351 (skip-chars-forward " \t\n\r") 2358 (skip-chars-forward " \t\n\r")
2352 (cond ((eq ?> (char-after (point))) 2359 (cond ((= ?> (following-char))
2353 ;; Ordinary tag end. 2360 ;; Ordinary tag end.
2354 (forward-char 1)) 2361 (forward-char 1))
2355 ((and (eq ?/ (char-after (point))) 2362 ((and (= ?/ (following-char))
2356 (not w3-p-d-end-tag-p)) 2363 (not w3-p-d-end-tag-p))
2357 ;; This is a NET-enabling start-tag. 2364 ;; This is a NET-enabling start-tag.
2358 (setq net-tag-p t) 2365 (setq net-tag-p t)
2359 (forward-char 1)) 2366 (forward-char 1))
2360 ((eq ?< (char-after (point))) 2367 ((= ?< (following-char))
2361 ;; *** Strictly speaking, the following text has to 2368 ;; *** Strictly speaking, the following text has to
2362 ;; lexically be STAGO or ETAGO, which means that it 2369 ;; lexically be STAGO or ETAGO, which means that it
2363 ;; can't match some other lexical unit. 2370 ;; can't match some other lexical unit.
2364 ;; Unclosed tag. 2371 ;; Unclosed tag.
2365 nil) 2372 nil)
2372 2379
2373 (setq tag-end (point))) 2380 (setq tag-end (point)))
2374 2381
2375 ((looking-at "/?>") 2382 ((looking-at "/?>")
2376 ;; We are looking at an empty tag (<>, </>). 2383 ;; We are looking at an empty tag (<>, </>).
2377 (setq w3-p-d-end-tag-p (eq ?/ (char-after (point)))) 2384 (setq w3-p-d-end-tag-p (= ?/ (following-char)))
2378 (setq w3-p-d-tag-name (if w3-p-d-end-tag-p 2385 (setq w3-p-d-tag-name (if w3-p-d-end-tag-p
2379 (w3-element-name w3-p-d-current-element) 2386 (w3-element-name w3-p-d-current-element)
2380 ;; *** Strictly speaking, if OMITTAG NO, then 2387 ;; *** Strictly speaking, if OMITTAG NO, then
2381 ;; we should use the most recently closed tag. 2388 ;; we should use the most recently closed tag.
2382 ;; But OMITTAG YES in HTML and I'm lazy. 2389 ;; But OMITTAG YES in HTML and I'm lazy.
2434 ;; parentheses somewhere inside the declaration. Handling 2441 ;; parentheses somewhere inside the declaration. Handling
2435 ;; this properly would require full parsing of markup 2442 ;; this properly would require full parsing of markup
2436 ;; declarations, a goal for the future. 2443 ;; declarations, a goal for the future.
2437 (w3-debug-html "Bad <! syntax.") 2444 (w3-debug-html "Bad <! syntax.")
2438 (skip-chars-forward "^>") 2445 (skip-chars-forward "^>")
2439 (if (eq ?> (char-after (point))) 2446 (if (= ?> (following-char))
2440 (forward-char)))) 2447 (forward-char))))
2441 (point)))) 2448 (point))))
2442 2449
2443 ((looking-at "!\\\[\\(\\([ \t\n\r]*[a-z]+\\)+[ \t\n\r]*\\)\\\[") 2450 ((looking-at "!\\\[\\(\\([ \t\n\r]*[a-z]+\\)+[ \t\n\r]*\\)\\\[")
2444 ;; We are looking at a marked section. 2451 ;; We are looking at a marked section.
2461 (keyword (car-safe (cond ((memq 'IGNORE keywords)) 2468 (keyword (car-safe (cond ((memq 'IGNORE keywords))
2462 ((memq 'CDATA keywords)) 2469 ((memq 'CDATA keywords))
2463 ((memq 'RCDATA keywords)) 2470 ((memq 'RCDATA keywords))
2464 ((memq 'INCLUDE keywords)) 2471 ((memq 'INCLUDE keywords))
2465 ((memq 'TEMP keywords)))))) 2472 ((memq 'TEMP keywords))))))
2466 (or (eq ?\[ (char-after (point))) 2473 (or (= ?\[ (following-char))
2467 ;; I probably shouldn't even check this, since it is so 2474 ;; I probably shouldn't even check this, since it is so
2468 ;; impossible. 2475 ;; impossible.
2469 (error "impossible ??")) 2476 (error "impossible"))
2470 (forward-char 1) 2477 (forward-char 1)
2471 (delete-region (1- (match-beginning 0)) (point)) 2478 (delete-region (1- (match-beginning 0)) (point))
2472 (cond ((eq 'IGNORE keyword) 2479 (cond ((eq 'IGNORE keyword)
2473 ;; Scan forward skipping over matching <![ ... ]]> 2480 ;; Scan forward skipping over matching <![ ... ]]>
2474 ;; until we find an unmatched "]]>". 2481 ;; until we find an unmatched "]]>".
2502 (backward-char 1) 2509 (backward-char 1)
2503 (delete-region 2510 (delete-region
2504 (point) 2511 (point)
2505 (progn 2512 (progn
2506 (skip-chars-forward "^>") 2513 (skip-chars-forward "^>")
2507 (if (eq ?> (char-after (point))) 2514 (if (= ?> (following-char))
2508 (forward-char)) 2515 (forward-char))
2509 (point)))) 2516 (point))))
2510 (t 2517 (t
2511 ;; This < is not a markup character. Pretend we didn't notice 2518 ;; This < is not a markup character. Pretend we didn't notice
2512 ;; it at all. We have skipped over the < already, so just loop 2519 ;; it at all. We have skipped over the < already, so just loop
2513 ;; again. 2520 ;; again.
2514 ))) 2521 )))
2515 2522
2516 ((eq ?& (char-after (point))) 2523 ((= ?& (following-char))
2517 (w3-expand-entity-at-point-maybe)) 2524 (w3-expand-entity-at-point-maybe))
2518 2525
2519 ((and (eq ?\] (char-after (point))) 2526 ((and (= ?\] (following-char))
2520 w3-p-d-in-parsed-marked-section 2527 w3-p-d-in-parsed-marked-section
2521 (looking-at "]]>")) 2528 (looking-at "]]>"))
2522 ;; *** handle the end of a parsed marked section. 2529 ;; *** handle the end of a parsed marked section.
2523 (error "***unimplemented***")) 2530 (error "***unimplemented***"))
2524 2531
2525 ((and (eq ?/ (char-after (point))) 2532 ((and (= ?/ (following-char))
2526 w3-p-d-null-end-tag-enabled) 2533 w3-p-d-null-end-tag-enabled)
2527 ;; We are looking at a null end tag. 2534 ;; We are looking at a null end tag.
2528 (setq w3-p-d-end-tag-p t) 2535 (setq w3-p-d-end-tag-p t)
2529 (setq between-tags-end (point)) 2536 (setq between-tags-end (point))
2530 (setq tag-end (1+ (point))) 2537 (setq tag-end (1+ (point)))
2553 (goto-char pt))) 2560 (goto-char pt)))
2554 2561
2555 ((looking-at (eval-when-compile 2562 ((looking-at (eval-when-compile
2556 (concat "[" (w3-invalid-sgml-chars) "]"))) 2563 (concat "[" (w3-invalid-sgml-chars) "]")))
2557 (w3-debug-html 2564 (w3-debug-html
2558 (format "Invalid SGML character: %c" (char-after (point)))) 2565 (format "Invalid SGML character: %c" (following-char)))
2559 (insert (or (cdr-safe (assq (char-after (point)) 2566 (insert (or (cdr-safe (assq (following-char)
2560 ;; These characters are apparently 2567 ;; These characters are apparently
2561 ;; from a Windows character set. 2568 ;; from a Windows character set.
2562 '((146 . "'") 2569 '((146 . "'")
2563 (153 . "TM")))) 2570 (153 . "TM"))))
2564 "")) 2571 ""))
2609 ;; w3-grok-tag-or-data, but it's not clear which will be 2616 ;; w3-grok-tag-or-data, but it's not clear which will be
2610 ;; faster in practice. 2617 ;; faster in practice.
2611 (or (setq content (w3-element-content w3-p-d-current-element)) 2618 (or (setq content (w3-element-content w3-p-d-current-element))
2612 ;; *** Strictly speaking, in SGML the record end is 2619 ;; *** Strictly speaking, in SGML the record end is
2613 ;; carriage return, not line feed. 2620 ;; carriage return, not line feed.
2614 (if (eq ?\n (char-after between-tags-start)) 2621 (if (= ?\n (char-after between-tags-start))
2615 (setq between-tags-start (1+ between-tags-start)))) 2622 (setq between-tags-start (1+ between-tags-start))))
2616 (if (= between-tags-start (point)) 2623 (if (= between-tags-start (point))
2617 ;; Do nothing. 2624 ;; Do nothing.
2618 nil 2625 nil
2619 ;; We are definitely going to add data characters to the 2626 ;; We are definitely going to add data characters to the
2620 ;; content. 2627 ;; content.
2628 ;; Protocol is that all but last data character item
2629 ;; must have been sent to display engine.
2630 (and content
2631 (stringp (car content))
2632 ;; Gross, disgusting hack to deal with old interface
2633 ;; to display engine. Remove as soon as possible.
2634 (not (memq (w3-element-name w3-p-d-current-element)
2635 '(plaintext style xmp textarea)))
2636 (w3-add-display-item 'text (car content)))
2621 (cond 2637 (cond
2622 ((and (= ?\n (preceding-char)) 2638 ((and (= ?\n (preceding-char))
2623 (/= between-tags-start (1- (point)))) 2639 (/= between-tags-start (1- (point))))
2624 (setq content (cons (buffer-substring between-tags-start 2640 (setq content (cons (buffer-substring between-tags-start
2625 (1- (point))) 2641 (1- (point)))
2626 content)) 2642 content))
2643 ;; Gross, disgusting hack to deal with old interface
2644 ;; to display engine. Remove as soon as possible.
2645 (or (memq (w3-element-name w3-p-d-current-element)
2646 '(plaintext style xmp textarea))
2647 (w3-add-display-item 'text (car content)))
2627 (setq content (cons "\n" content))) 2648 (setq content (cons "\n" content)))
2628 (t 2649 (t
2629 (setq content (cons (buffer-substring between-tags-start 2650 (setq content (cons (buffer-substring between-tags-start
2630 (point)) 2651 (point))
2631 content)))) 2652 content))))
2738 (if (re-search-forward (if w3-p-d-null-end-tag-enabled 2759 (if (re-search-forward (if w3-p-d-null-end-tag-enabled
2739 "</[a-z>]\\|[/&]" 2760 "</[a-z>]\\|[/&]"
2740 "</[a-z>]\\|&") 2761 "</[a-z>]\\|&")
2741 nil 'move) 2762 nil 'move)
2742 (goto-char (match-beginning 0))) 2763 (goto-char (match-beginning 0)))
2743 (eq ?& (char-after (point)))) 2764 (= ?& (following-char)))
2744 (w3-expand-entity-at-point-maybe))))))) 2765 (w3-expand-entity-at-point-maybe)))))))
2745 (t 2766 (t
2746 ;; The element is illegal here. We'll just discard the start 2767 ;; The element is illegal here. We'll just discard the start
2747 ;; tag as though we never saw it. 2768 ;; tag as though we never saw it.
2748 )))) 2769 ))))
2751 (setq w3-p-d-end-tag-p nil) 2772 (setq w3-p-d-end-tag-p nil)
2752 (setq net-tag-p nil) 2773 (setq net-tag-p nil)
2753 (setq tag-attributes nil) 2774 (setq tag-attributes nil)
2754 (setq tag-end nil))) 2775 (setq tag-end nil)))
2755 2776
2777 ;; Hand items to the display engine.
2778 (cond ((not nodraw)
2779 (set-buffer w3-draw-buffer)
2780 (while (not (eq parse-tag-stream-last-displayed-item
2781 w3-p-d-parse-tag-stream-tail-pointer))
2782 (setq parse-tag-stream-last-displayed-item
2783 (cdr parse-tag-stream-last-displayed-item))
2784 ;; We call w3-handle-single-tag from only one spot so that it
2785 ;; is reasonable to inline it, since it is a big function.
2786 (w3-handle-single-tag
2787 (car (car parse-tag-stream-last-displayed-item))
2788 (cdr (car parse-tag-stream-last-displayed-item))))
2789 (set-buffer parse-buffer)))
2790
2756 ;; End of main while loop. 2791 ;; End of main while loop.
2757 ) 2792 )
2758 2793
2759 ;; We have finished parsing the buffer! 2794 ;; We have finished parsing the buffer!
2760 (if status-message-format 2795 (if status-message-format
2761 (message "%sdone" (format status-message-format 100))) 2796 (message "%sdone" (format status-message-format 100)))
2797 ;; Do this now so the user can see the full results before Emacs
2798 ;; goes off and garbage-collects for an hour. :-(
2799 (if w3-do-incremental-display
2800 (w3-pause))
2762 2801
2763 ;; *** For debugging, save the true parse tree. 2802 ;; *** For debugging, save the true parse tree.
2764 ;; *** Make this look inside *DOCUMENT. 2803 ;; *** Make this look inside *DOCUMENT.
2765 (setq w3-last-parse-tree 2804 (setq w3-last-parse-tree
2766 (w3-element-content w3-p-d-current-element)) 2805 (w3-element-content w3-p-d-current-element))
2767 2806
2768 (w3-element-content w3-p-d-current-element) 2807 ;; Return the parse in the format expected, a stream of tags
2808 ;; possibly with a buffer at the front.
2809 (if nodraw
2810 ;; Discard the *dummy item at start of list.
2811 (cdr parse-tag-stream)
2812 (cons w3-draw-buffer (cdr parse-tag-stream)))
2813
2769 ))) 2814 )))
2815
2816
2817 ;;;
2818 ;;; Initialization of display engine to accept parser output.
2819 ;;;
2820
2821 (defun w3-prepare-draw-buffer-for-parse-buffer ()
2822 (setq list-buffers-directory nil)
2823 (let ((buf (get-buffer-create (url-generate-new-buffer-name
2824 "Untitled")))
2825 (info (mapcar (function (lambda (x) (cons x (symbol-value x))))
2826 w3-persistent-variables)))
2827 (setq w3-draw-buffer buf)
2828 (save-excursion
2829 (set-window-buffer (selected-window) buf)
2830 (set-buffer buf)
2831 (setq w3-draw-buffer (current-buffer))
2832 (erase-buffer)
2833 (buffer-disable-undo (current-buffer))
2834 (mapcar (function (lambda (x) (set (car x) (cdr x)))) info)
2835 (setq w3-last-fill-pos (point))
2836 (setq fill-column (min (- (or w3-strict-width (window-width))
2837 w3-right-border)
2838 (or w3-maximum-line-length (window-width))))
2839 (setq fill-prefix "")
2840 (w3-init-state))))
2770 2841
2771 2842
2772 2843
2773 (provide 'w3-parse) 2844 (provide 'w3-parse)
2774 2845