comparison lisp/w3/w3-parse.el @ 14:9ee227acff29 r19-15b90

Import from CVS: tag r19-15b90
author cvs
date Mon, 13 Aug 2007 08:48:42 +0200
parents ac2d302a0011
children 0293115a14e9
comparison
equal deleted inserted replaced
13:13c6d0aaafe5 14:9ee227acff29
1 ;; Created by: Joe Wells, jbw@csb.bu.edu 1 ;; Created by: Joe Wells, jbw@cs.bu.edu
2 ;; Created on: Sat Sep 30 17:25:40 1995 2 ;; Created on: Sat Sep 30 17:25:40 1995
3 ;; Filename: w3-parse.el 3 ;; Filename: w3-parse.el
4 ;; Purpose: Parse HTML and/or SGML for Emacs W3 browser. 4 ;; Purpose: Parse HTML and/or SGML for Emacs W3 browser.
5 5
6 ;; Copyright © 1995, 1996 Joseph Brian Wells 6 ;; Copyright © 1995, 1996 Joseph Brian Wells
15 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of 15 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
16 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 ;; GNU General Public License for more details. 17 ;; GNU General Public License for more details.
18 ;; 18 ;;
19 ;; You should have received a copy of the GNU General Public License 19 ;; You should have received a copy of the GNU General Public License
20 ;; along with this program; if not, write to the Free Software 20 ;; along with GNU Emacs; see the file COPYING. If not, write to the
21 ;; Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 21 ;; Free Software Foundation, Inc., 59 Temple Place - Suite 330,
22 ;; Boston, MA 02111-1307, USA.
22 ;; 23 ;;
23 ;; On November 13, 1995, the license was available at 24 ;; On November 13, 1995, the license was available at
24 ;; <URL:ftp://prep.ai.mit.edu/pub/gnu/COPYING-2.0>. It may still be 25 ;; <URL:ftp://prep.ai.mit.edu/pub/gnu/COPYING-2.0>. It may still be
25 ;; obtainable via that URL. 26 ;; obtainable via that URL.
26 27
55 ;; (remember this is really all one function). Some of the code which 56 ;; (remember this is really all one function). Some of the code which
56 ;; updates them is located inside the subfunctions. So that the compiler 57 ;; updates them is located inside the subfunctions. So that the compiler
57 ;; will not complain, these variables are defined with defvar. 58 ;; will not complain, these variables are defined with defvar.
58 59
59 (require 'w3-vars) 60 (require 'w3-vars)
61 (require 'mule-sysdp)
60 62
61 (eval-when-compile 63 (eval-when-compile
62 (defconst w3-p-s-var-list nil 64 (defconst w3-p-s-var-list nil
63 "A list of the scratch variables used by functions called by 65 "A list of the scratch variables used by functions called by
64 w3-parse-buffer which it is w3-parse-buffer's responsibility to 66 w3-parse-buffer which it is w3-parse-buffer's responsibility to
113 115
114 (defvar w3-p-d-open-element-stack) 116 (defvar w3-p-d-open-element-stack)
115 (put 'w3-p-d-open-element-stack 'variable-documentation 117 (put 'w3-p-d-open-element-stack 'variable-documentation
116 "A stack of the currently open elements, with the innermost enclosing 118 "A stack of the currently open elements, with the innermost enclosing
117 element on top and the outermost on bottom.") 119 element on top and the outermost on bottom.")
118
119 (defvar w3-p-d-parse-tag-stream-tail-pointer)
120 (put 'w3-p-d-parse-tag-stream-tail-pointer 'variable-documentation
121 "Points to last cons cell in parse-tag stream. We add items to tail of
122 parse-tag-stream instead of head.")
123 120
124 (defvar w3-p-d-shortrefs) 121 (defvar w3-p-d-shortrefs)
125 (put 'w3-p-d-shortrefs 'variable-documentation 122 (put 'w3-p-d-shortrefs 'variable-documentation
126 "An alist of the magic entity reference strings in the current 123 "An alist of the magic entity reference strings in the current
127 between-tags region and their replacements. Each item is of the format 124 between-tags region and their replacements. Each item is of the format
287 (let ((html-entities w3-html-entities)) 284 (let ((html-entities w3-html-entities))
288 (while html-entities 285 (while html-entities
289 (put (car (car html-entities)) 'html-entity-expansion 286 (put (car (car html-entities)) 'html-entity-expansion
290 (cons 'CDATA (if (integerp (cdr (car html-entities))) 287 (cons 'CDATA (if (integerp (cdr (car html-entities)))
291 (char-to-string 288 (char-to-string
292 (let ((c (cdr (car html-entities)))) 289 (mule-make-iso-character (cdr (car html-entities))))
293 (cond
294 ((and (> c 127) (boundp 'MULE))
295 (make-character lc-ltn1 c))
296 ;;((and (> c 127) (featurep 'mule))
297 ;; What???
298 ;;)
299 (t
300 c))))
301 (cdr (car html-entities))))) 290 (cdr (car html-entities)))))
302 (setq html-entities (cdr html-entities)))) 291 (setq html-entities (cdr html-entities))))
303 292
304 ;; These are handled differently than the normal HTML entities because 293 ;; These are handled differently than the normal HTML entities because
305 ;; we need to define the entities with 'nil instead of 'CDATA so 294 ;; we need to define the entities with 'nil instead of 'CDATA so
439 ;; char-to-string will hopefully do something useful with characters 428 ;; char-to-string will hopefully do something useful with characters
440 ;; larger than 255. I think in MULE it does. Is this true? 429 ;; larger than 255. I think in MULE it does. Is this true?
441 ;; Bill wants to call w3-resolve-numeric-entity here, but I think 430 ;; Bill wants to call w3-resolve-numeric-entity here, but I think
442 ;; that functionality belongs in char-to-string. 431 ;; that functionality belongs in char-to-string.
443 ;; The largest valid character in the I18N version of HTML is 65533. 432 ;; The largest valid character in the I18N version of HTML is 65533.
444 ;; <URL:ftp://ds.internic.net/internet-drafts/draft-ietf-html-i18n-01.txt> 433 ;; ftp://ds.internic.net/internet-drafts/draft-ietf-html-i18n-01.txt
445 ;; wrongo! Apparently, mule doesn't do sane things with char-to-string 434 ;; wrongo! Apparently, mule doesn't do sane things with char-to-string
446 ;; -wmp 7/9/96 435 ;; -wmp 7/9/96
447 (insert (char-to-string 436 (insert (char-to-string
448 (cond 437 (mule-make-iso-character w3-p-s-num))))
449 ((and (boundp 'MULE) (> w3-p-s-num 127))
450 (make-character lc-ltn1 w3-p-s-num))
451 ;;((and (featurep 'mule) (> w3-p-s-num 127))
452 ;;what??
453 ;;)
454 (t
455 w3-p-s-num)))))
456 ((looking-at "&#\\(re\\|rs\\|space\\|tab\\)[\ ;\n]?") ; \n should be \r 438 ((looking-at "&#\\(re\\|rs\\|space\\|tab\\)[\ ;\n]?") ; \n should be \r
457 (replace-match (assq (upcase (char-after (+ 3 (point)))) 439 (replace-match (assq (upcase (char-after (+ 3 (point))))
458 '(;; *** Strictly speaking, record end should be 440 '(;; *** Strictly speaking, record end should be
459 ;; carriage return. 441 ;; carriage return.
460 (?E . "\n") ; RE 442 (?E . "\n") ; RE
783 (concat "^&<" 765 (concat "^&<"
784 (w3-invalid-sgml-chars) 766 (w3-invalid-sgml-chars)
785 (if w3-p-d-null-end-tag-enabled "/" "") 767 (if w3-p-d-null-end-tag-enabled "/" "")
786 (if w3-p-d-in-parsed-marked-section "]" "") 768 (if w3-p-d-in-parsed-marked-section "]" "")
787 (or w3-p-d-shortref-chars "")))) 769 (or w3-p-d-shortref-chars ""))))
788
789 ;; Modifies free variable:
790 ;; w3-p-d-parse-tag-stream-tail-pointer
791 (defsubst w3-add-display-item (tag value)
792 (setcdr w3-p-d-parse-tag-stream-tail-pointer
793 (list (cons tag value)))
794 (setq w3-p-d-parse-tag-stream-tail-pointer
795 (cdr w3-p-d-parse-tag-stream-tail-pointer)))
796
797 ) 770 )
798 771
799 (eval-when-compile 772 (eval-when-compile
800 (w3-p-s-var-def w3-p-s-overrides) 773 (w3-p-s-var-def w3-p-s-overrides)
801 (w3-p-s-var-def w3-p-s-undo-list) 774 (w3-p-s-var-def w3-p-s-undo-list)
806 ;; w3-p-d-current-element, w3-p-d-open-element-stack 779 ;; w3-p-d-current-element, w3-p-d-open-element-stack
807 ;; Destroys free variables: 780 ;; Destroys free variables:
808 ;; w3-p-s-overrides, w3-p-s-undo-list, w3-p-s-var 781 ;; w3-p-s-overrides, w3-p-s-undo-list, w3-p-s-var
809 (defsubst w3-open-element (tag attributes) 782 (defsubst w3-open-element (tag attributes)
810 783
811 ;; Send trailing data character item in the old current element to
812 ;; display engine.
813 (if (stringp (car-safe (w3-element-content w3-p-d-current-element)))
814 (w3-add-display-item
815 'text
816 (car-safe (w3-element-content w3-p-d-current-element))))
817
818 ;; Push new element on stack. 784 ;; Push new element on stack.
819 (setq w3-p-d-open-element-stack (cons w3-p-d-current-element 785 (setq w3-p-d-open-element-stack (cons w3-p-d-current-element
820 w3-p-d-open-element-stack)) 786 w3-p-d-open-element-stack))
821 (setq w3-p-d-current-element (w3-fresh-element-for-tag tag)) 787 (setq w3-p-d-current-element (w3-fresh-element-for-tag tag))
822 788
872 (if (eq 'XINHERIT (w3-element-content-model w3-p-d-current-element)) 838 (if (eq 'XINHERIT (w3-element-content-model w3-p-d-current-element))
873 (w3-set-element-content-model 839 (w3-set-element-content-model
874 w3-p-d-current-element 840 w3-p-d-current-element
875 (w3-element-content-model (car w3-p-d-open-element-stack)))) 841 (w3-element-content-model (car w3-p-d-open-element-stack))))
876 842
877 ;; Send the start-tag and attributes to the display engine. 843 )
878 (if (memq tag '(plaintext style xmp textarea))
879 ;; Garbage special-casing for old display engine.
880 ;; Nothing is sent until end-tag is found.
881 ;; The DTD will ensure no subelements of these elements.
882 nil
883 ;; Normal procedure.
884 (w3-add-display-item tag attributes)))
885 ) 844 )
886 845
887 ;; The protocol for handing items to the display engine is as follows. 846 ;; The protocol for handing items to the display engine is as follows.
888 ;; 847 ;;
889 ;; For an element, send (START-TAG . ATTS), each member of the content, 848 ;; For an element, send (START-TAG . ATTS), each member of the content,
958 ;; any trailing data character item to display engine. 917 ;; any trailing data character item to display engine.
959 (setq w3-p-s-content (w3-element-content w3-p-d-current-element)) 918 (setq w3-p-s-content (w3-element-content w3-p-d-current-element))
960 (cond ((null w3-p-s-content)) 919 (cond ((null w3-p-s-content))
961 ((equal "\n" (car w3-p-s-content)) 920 ((equal "\n" (car w3-p-s-content))
962 (setq w3-p-s-content (cdr w3-p-s-content))) 921 (setq w3-p-s-content (cdr w3-p-s-content)))
963 ((and (stringp (car w3-p-s-content)) 922 )
964 ;; Garbage special-casing for old display engine.
965 (not (memq w3-p-s-end-tag
966 '(/plaintext /style /xmp /textarea))))
967 (w3-add-display-item 'text (car w3-p-s-content))))
968 923
969 ;; Send the end-tag to the display engine, but only if the element is 924 (cond ;; *** Handle LISTING the way the old parser did.
970 ;; allowed to have an end tag.
971 (cond ((memq w3-p-s-end-tag '(/plaintext /style /xmp /textarea))
972 ;; Garbage special-casing for old display engine.
973 ;; Format old display engine expects for these elements:
974 ;; (START-TAG . ((data . DATA-CHARACTERS) . ATTRIBUTES))
975 (w3-add-display-item
976 ;; Use the *start*-tag, not the end-tag.
977 (w3-element-name w3-p-d-current-element)
978 (cons (cons 'data
979 (condition-case nil
980 (mapconcat 'identity w3-p-s-content "")
981 (error "eeek! subelement content!")))
982 (w3-element-attributes w3-p-d-current-element))))
983 ;; *** Handle LISTING the way the old parser did.
984 ((eq 'EMPTY (w3-element-content-model w3-p-d-current-element)) 925 ((eq 'EMPTY (w3-element-content-model w3-p-d-current-element))
985 ;; Do nothing, can't have an end tag. 926 ;; Do nothing, can't have an end tag.
986 ) 927 )
987 (t 928 (t
988 ;; Normal case. 929 ;; Normal case.
989 (w3-add-display-item w3-p-s-end-tag nil)
990 (if (null w3-p-s-content) 930 (if (null w3-p-s-content)
991 (w3-debug-html 931 (w3-debug-html
992 :bad-style :outer 932 :bad-style :outer
993 ;; Don't warn for empty TD elements or empty A elements 933 ;; Don't warn for empty TD elements or empty A elements
994 ;; with no HREF attribute. 934 ;; with no HREF attribute.
1173 1113
1174 ;; client-side imagemaps 1114 ;; client-side imagemaps
1175 (%imagemaps . (area map)) 1115 (%imagemaps . (area map))
1176 ;; special action is taken for %text inside %body.content in the 1116 ;; special action is taken for %text inside %body.content in the
1177 ;; content model of each element. 1117 ;; content model of each element.
1178 (%body.content . (%heading %block hr div address %imagemaps)) 1118 (%body.content . (%heading %block style hr div address %imagemaps))
1179 1119
1180 (%heading . (h1 h2 h3 h4 h5 h6)) 1120 (%heading . (h1 h2 h3 h4 h5 h6))
1181 1121
1182 ;; Emacs-w3 extensions 1122 ;; Emacs-w3 extensions
1183 (%emacsw3-crud . (pinhead flame cookie yogsothoth hype peek)) 1123 (%emacsw3-crud . (pinhead flame cookie yogsothoth hype peek))
1184 1124
1185 (%block . (p %list dl form %preformatted font 1125 (%block . (p %list dl form %preformatted
1186 %blockquote isindex fn table fig note 1126 %blockquote isindex fn table fig note
1187 center %block-deprecated %block-obsoleted)) 1127 center %block-deprecated %block-obsoleted))
1188 (%list . (ul ol)) 1128 (%list . (ul ol))
1189 (%preformatted . (pre)) 1129 (%preformatted . (pre))
1190 (%blockquote . (bq)) 1130 (%blockquote . (bq))
1194 ;; Why is IMG in this list? 1134 ;; Why is IMG in this list?
1195 (%pre.exclusion . (*include img *discard tab math big small sub sup)) 1135 (%pre.exclusion . (*include img *discard tab math big small sub sup))
1196 1136
1197 (%text . (*data b %notmath sub sup %emacsw3-crud)) 1137 (%text . (*data b %notmath sub sup %emacsw3-crud))
1198 (%notmath . (%special %font %phrase %misc)) 1138 (%notmath . (%special %font %phrase %misc))
1199 (%font . (i u s strike tt big small sub sup 1139 (%font . (i u s strike tt big small sub sup font
1200 roach secret wired)) ;; B left out for MATH 1140 roach secret wired)) ;; B left out for MATH
1201 (%phrase . (em strong dfn code samp kbd var cite blink)) 1141 (%phrase . (em strong dfn code samp kbd var cite blink))
1202 (%special . (a img applet font br script map math tab)) 1142 (%special . (a img applet object font basefont br script style map math tab span bdo))
1203 (%misc . (q lang au person acronym abbrev ins del)) 1143 (%misc . (q lang au person acronym abbrev ins del))
1204 1144
1205 (%formula . (*data %math)) 1145 (%formula . (*data %math))
1206 (%math . (box above below %mathvec root sqrt array sub sup 1146 (%math . (box above below %mathvec root sqrt array sub sup
1207 %mathface)) 1147 %mathface))
1275 ;; I haven't bothered to enumerate them. 1215 ;; I haven't bothered to enumerate them.
1276 (*close))]) 1216 (*close))])
1277 (end-tag-omissible . t)) 1217 (end-tag-omissible . t))
1278 ;; SCRIPT - - (#PCDATA) 1218 ;; SCRIPT - - (#PCDATA)
1279 ((script) 1219 ((script)
1280 (content-model . CDATA ; not official, but allows 1220 (content-model . XCDATA ; not official, but allows
1281 ; comment hiding of script 1221 ; comment hiding of script, and also
1222 ; idiots that use '</' in scripts.
1282 )) 1223 ))
1283 ;; TITLE - - (#PCDATA) 1224 ;; TITLE - - (#PCDATA)
1284 ((title) 1225 ((title)
1285 (content-model . RCDATA ; not official 1226 (content-model . RCDATA ; not official
1286 ;; [((*data) include-space nil nil)] 1227 ;; [((*data) include-space nil nil)]
1325 ((%heading) 1266 ((%heading)
1326 (content-model . [((%text) 1267 (content-model . [((%text)
1327 include-space 1268 include-space
1328 ((%in-text-ignore)) 1269 ((%in-text-ignore))
1329 nil)])) 1270 nil)]))
1271 ((span bdo)
1272 (content-model . [((%text)
1273 include-space
1274 nil
1275 nil)])
1276 )
1330 ((p) 1277 ((p)
1331 (content-model . [((%text) 1278 (content-model . [((%text)
1332 include-space 1279 include-space
1333 nil 1280 nil
1334 ;; *** Should only close if tag can 1281 ;; *** Should only close if tag can
1454 ;; Push <P> before data characters. Very non-SGML. 1401 ;; Push <P> before data characters. Very non-SGML.
1455 (((%text) p) 1402 (((%text) p)
1456 ((credit) *close)) 1403 ((credit) *close))
1457 nil)]) 1404 nil)])
1458 (end-tag-omissible . t)) 1405 (end-tag-omissible . t))
1459 ((%emacsw3-crud) 1406 ((%emacsw3-crud basefont)
1460 (content-model . EMPTY)) 1407 (content-model . EMPTY))
1461 ;; FORM - - %body.content -(FORM) +(INPUT|KEYGEN|SELECT|TEXTAREA) 1408 ;; FORM - - %body.content -(FORM) +(INPUT|KEYGEN|SELECT|TEXTAREA)
1462 ((form) 1409 ((form)
1463 ;; Same as BODY. Ugh! 1410 ;; Same as BODY. Ugh!
1464 (content-model . [((%body.content) 1411 (content-model . [((%body.content)
1714 nil)])) 1661 nil)]))
1715 ((frame) 1662 ((frame)
1716 (content-model . EMPTY)) 1663 (content-model . EMPTY))
1717 ;; 1664 ;;
1718 ;; APPLET is a Java thing. 1665 ;; APPLET is a Java thing.
1666 ;; OBJECT is a cougar thing
1719 ;; <URL:http://java.sun.com/JDK-beta/filesinkit/README> 1667 ;; <URL:http://java.sun.com/JDK-beta/filesinkit/README>
1720 ((applet) 1668 ((applet object)
1721 ;; I really don't want to add another ANY content-model. 1669 ;; I really don't want to add another ANY content-model.
1722 (content-model . XINHERIT) 1670 (content-model . XINHERIT)
1723 (inclusions . (param))) 1671 (inclusions . (param)))
1724 ((param) 1672 ((param)
1725 (content-model . EMPTY)) 1673 (content-model . EMPTY))
1747 (w3-p-s-var-def w3-p-s-state-transitions) 1695 (w3-p-s-var-def w3-p-s-state-transitions)
1748 (w3-p-s-var-def w3-p-s-transition) 1696 (w3-p-s-var-def w3-p-s-transition)
1749 (w3-p-s-var-def w3-p-s-tran-list) 1697 (w3-p-s-var-def w3-p-s-tran-list)
1750 (w3-p-s-var-def w3-p-s-content-model) 1698 (w3-p-s-var-def w3-p-s-content-model)
1751 (w3-p-s-var-def w3-p-s-except) 1699 (w3-p-s-var-def w3-p-s-except)
1700 (w3-p-s-var-def w3-p-s-baseobject)
1701 (w3-p-s-var-def w3-p-s-btdt)
1752 ;; Uses free variables: 1702 ;; Uses free variables:
1753 ;; w3-p-d-current-element, w3-p-d-exceptions 1703 ;; w3-p-d-current-element, w3-p-d-exceptions
1754 ;; Destroys free variables: 1704 ;; Destroys free variables:
1755 ;; w3-p-s-includep, w3-p-s-state-transitions, w3-p-s-transition, 1705 ;; w3-p-s-includep, w3-p-s-state-transitions, w3-p-s-transition,
1756 ;; w3-p-s-tran-list, w3-p-s-content-model, w3-p-s-except 1706 ;; w3-p-s-tran-list, w3-p-s-content-model, w3-p-s-except
1763 (w3-element-content-model w3-p-d-current-element))) 1713 (w3-element-content-model w3-p-d-current-element)))
1764 (or (and (memq w3-p-s-content-model 1714 (or (and (memq w3-p-s-content-model
1765 '(CDATA RCDATA XCDATA XXCDATA)) 1715 '(CDATA RCDATA XCDATA XXCDATA))
1766 (memq tag-name '(*data *space))) 1716 (memq tag-name '(*data *space)))
1767 ;; *** Implement ANY. 1717 ;; *** Implement ANY.
1768 (error "impossible")) 1718 (error "impossible content model lossage"))
1769 (setq w3-p-s-includep t) 1719 (setq w3-p-s-includep t)
1770 ;; Exit loop. 1720 ;; Exit loop.
1771 nil) 1721 nil)
1772 (t 1722 (t
1773 ;; We have a complex content model. 1723 ;; We have a complex content model.
1943 (w3-open-element (car w3-p-s-transition) nil) 1893 (w3-open-element (car w3-p-s-transition) nil)
1944 ;; Now we loop and try again in the new element's 1894 ;; Now we loop and try again in the new element's
1945 ;; content-model. 1895 ;; content-model.
1946 t) 1896 t)
1947 (t 1897 (t
1948 (error "impossible"))))))) 1898 (error "impossible transition")))))))
1949 1899
1950 ;; Empty while loop body. 1900 ;; Empty while loop body.
1951 ) 1901 )
1952 1902
1953 ;; Return value to user indicating whether to include or discard item: 1903 ;; Return value to user indicating whether to include or discard item:
1983 ;; % % 1933 ;; % %
1984 ;; % This is the *ONLY* valid entry point in this file! % 1934 ;; % This is the *ONLY* valid entry point in this file! %
1985 ;; % DO NOT call any of the other functions! % 1935 ;; % DO NOT call any of the other functions! %
1986 ;; % % 1936 ;; % %
1987 ;; %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 1937 ;; %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1988 (defun w3-parse-buffer (&optional buff nodraw) 1938 (defun w3-parse-buffer (&optional buff)
1989 "Parse contents of BUFF as HTML. 1939 "Parse contents of BUFF as HTML.
1990 BUFF defaults to the value of url-working-buffer. 1940 BUFF defaults to the value of url-working-buffer.
1991 Destructively alters contents of BUFF. 1941 Destructively alters contents of BUFF.
1992 Unless optional second argument NODRAW is non-nil, calls the display
1993 engine on the parsed HTML.
1994 Returns a data structure containing the parsed information." 1942 Returns a data structure containing the parsed information."
1995 1943
1996 (set-buffer (or buff url-working-buffer)) 1944 (set-buffer (or buff url-working-buffer))
1997 (setq buff (current-buffer)) 1945 (setq buff (current-buffer))
1998 (set-syntax-table w3-sgml-md-syntax-table) 1946 (set-syntax-table w3-sgml-md-syntax-table)
2009 (goto-char (point-min)) 1957 (goto-char (point-min))
2010 1958
2011 ;; *** Should premunge line boundaries. 1959 ;; *** Should premunge line boundaries.
2012 ;; ******************** 1960 ;; ********************
2013 1961
2014 ;; Prepare another buffer to draw in unless told not to.
2015 (if (not nodraw)
2016 (w3-prepare-draw-buffer-for-parse-buffer))
2017
2018 (let* ( 1962 (let* (
2019 ;; Speed hack, see the variable doc string. 1963 ;; Speed hack, see the variable doc string.
2020 (gc-cons-threshold (if (> w3-gc-cons-threshold-multiplier 0) 1964 (gc-cons-threshold (if (> w3-gc-cons-threshold-multiplier 0)
2021 (* w3-gc-cons-threshold-multiplier 1965 (* w3-gc-cons-threshold-multiplier
2022 gc-cons-threshold) 1966 gc-cons-threshold)
2036 (format "Parsed %%3d%%%% of %d..." (- (point-max) (point-min))))) 1980 (format "Parsed %%3d%%%% of %d..." (- (point-max) (point-min)))))
2037 1981
2038 ;; Use a float value for 100 if possible, otherwise integer. 1982 ;; Use a float value for 100 if possible, otherwise integer.
2039 ;; Determine which we can use outside of the loop for speed. 1983 ;; Determine which we can use outside of the loop for speed.
2040 (one-hundred (funcall (if (fboundp 'float) 'float 'identity) 100)) 1984 (one-hundred (funcall (if (fboundp 'float) 'float 'identity) 100))
2041
2042 ;; Speed up checking whether to do incremental display.
2043 (w3-do-incremental-display (if nodraw nil w3-do-incremental-display))
2044
2045 ;; Used to convert parse tree to tag stream that old display
2046 ;; engine expects. Will change when display engine is rewritten.
2047 (parse-tag-stream '(*dummy))
2048
2049 ;; See doc string.
2050 (w3-p-d-parse-tag-stream-tail-pointer parse-tag-stream)
2051
2052 ;; Points to cons cell in parse-tag-stream whose car is the last
2053 ;; item that has been sent to display engine.
2054 (parse-tag-stream-last-displayed-item parse-tag-stream)
2055 1985
2056 ;; The buffer which contains the HTML we are parsing. This 1986 ;; The buffer which contains the HTML we are parsing. This
2057 ;; variable is used to avoid using the more expensive 1987 ;; variable is used to avoid using the more expensive
2058 ;; save-excursion. 1988 ;; save-excursion.
2059 (parse-buffer (current-buffer)) 1989 (parse-buffer (current-buffer))
2136 ;; Scratch variables used in this function 2066 ;; Scratch variables used in this function
2137 ref attr-name attr-value content-model content open-list 2067 ref attr-name attr-value content-model content open-list
2138 ) 2068 )
2139 ;; Scratch variables used by macros and defsubsts we call. 2069 ;; Scratch variables used by macros and defsubsts we call.
2140 (w3-p-s-let-bindings 2070 (w3-p-s-let-bindings
2141
2142 (w3-update-non-markup-chars) 2071 (w3-update-non-markup-chars)
2143 2072 (setq w3-p-s-baseobject (url-generic-parse-url (url-view-url t)))
2144 ;; Main loop. Handle markup as follows: 2073 ;; Main loop. Handle markup as follows:
2145 ;; 2074 ;;
2146 ;; non-empty tag: Handle the region since the previous tag as PCDATA, 2075 ;; non-empty tag: Handle the region since the previous tag as PCDATA,
2147 ;; RCDATA, CDATA, if allowed by syntax. Then handle the tag. 2076 ;; RCDATA, CDATA, if allowed by syntax. Then handle the tag.
2148 ;; 2077 ;;
2171 (setq last-loop-start (point))) 2100 (setq last-loop-start (point)))
2172 2101
2173 ;; Display progress messages if asked and/or do incremental display 2102 ;; Display progress messages if asked and/or do incremental display
2174 ;; of results 2103 ;; of results
2175 (cond ((= 0 (% (setq loop-count (1+ loop-count)) 40)) 2104 (cond ((= 0 (% (setq loop-count (1+ loop-count)) 40))
2176 (if w3-do-incremental-display
2177 (w3-pause))
2178 (if status-message-format 2105 (if status-message-format
2179 (message status-message-format 2106 (message status-message-format
2180 ;; Percentage of buffer processed. 2107 ;; Percentage of buffer processed.
2181 (/ (* (point) one-hundred) (point-max)))))) 2108 (/ (* (point) one-hundred) (point-max))))))
2182 2109
2194 (forward-char) 2121 (forward-char)
2195 (cond 2122 (cond
2196 2123
2197 ((looking-at "/?\\([a-z][-a-z0-9.]*\\)") 2124 ((looking-at "/?\\([a-z][-a-z0-9.]*\\)")
2198 ;; We are looking at a non-empty tag. 2125 ;; We are looking at a non-empty tag.
2199 2126
2127 ;; Downcase it in the buffer, to save creation of a string
2128 (downcase-region (match-beginning 1) (match-end 1))
2200 (setq w3-p-d-tag-name 2129 (setq w3-p-d-tag-name
2201 (intern (downcase (buffer-substring (match-beginning 1) 2130 (intern (buffer-substring (match-beginning 1)
2202 (match-end 1))))) 2131 (match-end 1))))
2203 (setq w3-p-d-end-tag-p (= ?/ (following-char))) 2132 (setq w3-p-d-end-tag-p (= ?/ (following-char)))
2204 (setq between-tags-end (1- (point))) 2133 (setq between-tags-end (1- (point)))
2205 (goto-char (match-end 0)) 2134 (goto-char (match-end 0))
2206 2135
2207 ;; Read the attributes from a start-tag. 2136 ;; Read the attributes from a start-tag.
2208 (or 2137 (if w3-p-d-end-tag-p
2209 w3-p-d-end-tag-p 2138 (if (looking-at "[ \t\r\n/]*>")
2139 nil
2140 ;; This is in here to deal with those idiots who stick
2141 ;; attribute/value pairs on end tags. *sigh*
2142 (w3-debug-html "Evil attributes on end tag.")
2143 (skip-chars-forward "^>"))
2210 2144
2211 ;; Attribute values can be: 2145 ;; Attribute values can be:
2212 ;; "STRING" where STRING does not contain the double quote 2146 ;; "STRING" where STRING does not contain the double quote
2213 ;; 'STRING' where STRING does not contain the single quote 2147 ;; 'STRING' where STRING does not contain the single quote
2214 ;; name-start character, *name character 2148 ;; name-start character, *name character
2225 (concat 2159 (concat
2226 ;; Leading whitespace. 2160 ;; Leading whitespace.
2227 "[ \n\r\t]*" 2161 "[ \n\r\t]*"
2228 ;; The attribute name, possibly with a bad syntax 2162 ;; The attribute name, possibly with a bad syntax
2229 ;; component. 2163 ;; component.
2230 "\\([a-z][-a-z0-9.]*\\(\\([_][-a-z0-9._]*\\)?\\)\\)" 2164 "\\([a-z_][-a-z0-9.]*\\(\\([_][-a-z0-9._]*\\)?\\)\\)"
2231 ;; Trailing whitespace and perhaps an "=". 2165 ;; Trailing whitespace and perhaps an "=".
2232 "[ \n\r\t]*\\(\\(=[ \n\r\t]*\\)?\\)"))) 2166 "[ \n\r\t]*\\(\\(=[ \n\r\t]*\\)?\\)")))
2233 2167
2234 (cond ((/= (match-beginning 2) (match-end 2)) 2168 (cond ((/= (match-beginning 2) (match-end 2))
2235 (w3-debug-html 2169 (w3-debug-html
2236 :nocontext 2170 :nocontext
2237 (format "Bad attribute name syntax: %s" 2171 (format "Bad attribute name syntax: %s"
2238 (buffer-substring (match-beginning 1) 2172 (buffer-substring (match-beginning 1)
2239 (match-end 1)))))) 2173 (match-end 1))))))
2240 2174
2175 ;; Downcase it in the buffer, to save creation of a string
2176 (downcase-region (match-beginning 1) (match-end 1))
2241 (setq attr-name 2177 (setq attr-name
2242 (intern (downcase (buffer-substring (match-beginning 1) 2178 (intern (buffer-substring (match-beginning 1)
2243 (match-end 1))))) 2179 (match-end 1))))
2244 (goto-char (match-end 0)) 2180 (goto-char (match-end 0))
2245 (cond 2181 (cond
2246 ((< (match-beginning 4) (match-end 4)) 2182 ((< (match-beginning 4) (match-end 4))
2247 ;; A value was specified (e.g. ATTRIBUTE=VALUE). 2183 ;; A value was specified (e.g. ATTRIBUTE=VALUE).
2248 (cond 2184 (cond
2251 (concat 2187 (concat
2252 ;; Literal with double quotes. 2188 ;; Literal with double quotes.
2253 "\"\\([^\"]*\\)\"" 2189 "\"\\([^\"]*\\)\""
2254 "\\|" 2190 "\\|"
2255 ;; Literal with single quotes. 2191 ;; Literal with single quotes.
2256 "'\\([^']\\)*'" 2192 "'\\([^']*\\)'"
2257 "\\|" 2193 "\\|"
2258 ;; Handle bad HTML conflicting with NET-enabling 2194 ;; Handle bad HTML conflicting with NET-enabling
2259 ;; start-tags. 2195 ;; start-tags.
2260 "\\([-a-z0-9.]+/[-a-z0-9._/#]+\\)[ \t\n\r>]" 2196 "\\([-a-z0-9.]+/[-a-z0-9._/#]+\\)[ \t\n\r>]"
2261 "\\|" 2197 "\\|"
2284 (goto-char (point-min)) 2220 (goto-char (point-min))
2285 (while (progn 2221 (while (progn
2286 (skip-chars-forward "^&") 2222 (skip-chars-forward "^&")
2287 (not (eobp))) 2223 (not (eobp)))
2288 (w3-expand-entity-at-point-maybe)) 2224 (w3-expand-entity-at-point-maybe))
2289 (subst-char-in-region (point-min) (point-max) ?\t 32) 2225 (subst-char-in-region (point-min) (point-max) ?\t ? )
2290 (subst-char-in-region (point-min) (point-max) ?\n 32)) 2226 (subst-char-in-region (point-min) (point-max) ?\n ? ))
2291 ;; Set this after we have changed the size of the 2227 ;; Set this after we have changed the size of the
2292 ;; attribute. 2228 ;; attribute.
2293 (setq attribute-value-end (1+ (point-max)))) 2229 (setq attribute-value-end (1+ (point-max))))
2294 ((match-beginning 4) 2230 ((match-beginning 4)
2295 (setq attribute-value-end (match-end 4)) 2231 (setq attribute-value-end (match-end 4))
2303 (match-end 3))) 2239 (match-end 3)))
2304 (w3-debug-html :nocontext 2240 (w3-debug-html :nocontext
2305 (format "Evil attribute value syntax: %s" 2241 (format "Evil attribute value syntax: %s"
2306 (buffer-substring (point-min) (point-max))))) 2242 (buffer-substring (point-min) (point-max)))))
2307 (t 2243 (t
2308 (error "impossible")))) 2244 (error "impossible attribute value"))))
2309 ((memq (following-char) '(?\" ?')) 2245 ((memq (following-char) '(?\" ?'))
2310 ;; Missing terminating quote character. 2246 ;; Missing terminating quote character.
2311 (narrow-to-region (point) 2247 (narrow-to-region (point)
2312 (progn 2248 (progn
2313 (forward-char 1) 2249 (forward-char 1)
2314 (skip-chars-forward "^ \t\n\r'\"=<>") 2250 (skip-chars-forward "^ \t\n\r'\"<>")
2315 (setq attribute-value-end (point)))) 2251 (setq attribute-value-end (point))))
2316 (w3-debug-html :nocontext 2252 (w3-debug-html :nocontext
2317 (format "Attribute value missing end quote: %s" 2253 (format "Attribute value missing end quote: %s"
2318 (buffer-substring (point-min) (point-max)))) 2254 (buffer-substring (point-min) (point-max))))
2319 (narrow-to-region (1+ (point-min)) (point-max))) 2255 (narrow-to-region (1+ (point-min)) (point-max)))
2320 (t 2256 (t
2321 ;; We have a syntactically invalid attribute value. Let's 2257 ;; We have a syntactically invalid attribute value. Let's
2322 ;; make a best guess as to what the author intended. 2258 ;; make a best guess as to what the author intended.
2323 (narrow-to-region (point) 2259 (narrow-to-region (point)
2324 (progn 2260 (progn
2325 (skip-chars-forward "^ \t\n\r'\"=<>") 2261 (skip-chars-forward "^ \t\n\r'\"<>")
2326 (setq attribute-value-end (point)))) 2262 (setq attribute-value-end (point))))
2327 (w3-debug-html :nocontext 2263 (w3-debug-html :nocontext
2328 (format "Bad attribute value syntax: %s" 2264 (format "Bad attribute value syntax: %s"
2329 (buffer-substring (point-min) (point-max)))))) 2265 (buffer-substring (point-min) (point-max))))))
2330 ;; Now we have isolated the attribute value. We need to 2266 ;; Now we have isolated the attribute value. We need to
2337 ;; * smash case 2273 ;; * smash case
2338 ;; * remove leading/trailing whitespace 2274 ;; * remove leading/trailing whitespace
2339 ;; * smash multiple space sequences into single spaces 2275 ;; * smash multiple space sequences into single spaces
2340 ;; * verify the syntax of each token 2276 ;; * verify the syntax of each token
2341 (setq attr-value (buffer-substring (point-min) (point-max))) 2277 (setq attr-value (buffer-substring (point-min) (point-max)))
2278 (case attr-name
2279 (class
2280 (setq attr-value (split-string attr-value "[ ,]+")))
2281 (align
2282 (if (string-match "^[ \t\r\n]*\\(.*\\)[ \t\r\n]*$"
2283 attr-value)
2284 (setq attr-value (downcase
2285 (substring attr-value
2286 (match-beginning 1)
2287 (match-end 1))))
2288 (setq attr-value (downcase attr-value)))
2289 (setq attr-value (intern attr-value)))
2290 ((src href)
2291 ;; I should expand URLs here
2292 )
2293 (otherwise nil)
2294 )
2342 (widen) 2295 (widen)
2343 (goto-char attribute-value-end)) 2296 (goto-char attribute-value-end))
2344 (t 2297 (t
2345 ;; No value was specified, in which case NAME should be 2298 ;; No value was specified, in which case NAME should be
2346 ;; taken as ATTRIBUTE=NAME where NAME is one of the 2299 ;; taken as ATTRIBUTE=NAME where NAME is one of the
2350 ;; is wrong. 2303 ;; is wrong.
2351 (setq attr-value (symbol-name attr-name)))) 2304 (setq attr-value (symbol-name attr-name))))
2352 2305
2353 ;; Accumulate the attributes. 2306 ;; Accumulate the attributes.
2354 (setq tag-attributes (cons (cons attr-name attr-value) 2307 (setq tag-attributes (cons (cons attr-name attr-value)
2355 tag-attributes)))) 2308 tag-attributes)))
2309
2310 (cond
2311 ((and (eq w3-p-d-tag-name 'base)
2312 (setq w3-p-s-baseobject
2313 (or (assq 'src tag-attributes)
2314 (assq 'href tag-attributes))))
2315 (setq w3-p-s-baseobject (url-generic-parse-url
2316 (cdr w3-p-s-baseobject))))
2317 ((setq w3-p-s-btdt (or (assq 'src tag-attributes)
2318 (assq 'href tag-attributes)
2319 (assq 'action tag-attributes)))
2320 (setcdr w3-p-s-btdt (url-expand-file-name (cdr w3-p-s-btdt)
2321 w3-p-s-baseobject))
2322 (setq w3-p-s-btdt (if (url-have-visited-url (cdr w3-p-s-btdt))
2323 ":visited"
2324 ":link"))
2325 (if (assq 'class tag-attributes)
2326 (setcdr (assq 'class tag-attributes)
2327 (cons w3-p-s-btdt
2328 (cdr (assq 'class tag-attributes))))
2329 (setq tag-attributes (cons (cons 'class (list w3-p-s-btdt))
2330 tag-attributes))))
2331 )
2332 )
2356 2333
2357 ;; Process the end of the tag. 2334 ;; Process the end of the tag.
2358 (skip-chars-forward " \t\n\r") 2335 (skip-chars-forward " \t\n\r")
2359 (cond ((= ?> (following-char)) 2336 (cond ((= ?> (following-char))
2360 ;; Ordinary tag end. 2337 ;; Ordinary tag end.
2471 ((memq 'INCLUDE keywords)) 2448 ((memq 'INCLUDE keywords))
2472 ((memq 'TEMP keywords)))))) 2449 ((memq 'TEMP keywords))))))
2473 (or (= ?\[ (following-char)) 2450 (or (= ?\[ (following-char))
2474 ;; I probably shouldn't even check this, since it is so 2451 ;; I probably shouldn't even check this, since it is so
2475 ;; impossible. 2452 ;; impossible.
2476 (error "impossible")) 2453 (error "impossible ??"))
2477 (forward-char 1) 2454 (forward-char 1)
2478 (delete-region (1- (match-beginning 0)) (point)) 2455 (delete-region (1- (match-beginning 0)) (point))
2479 (cond ((eq 'IGNORE keyword) 2456 (cond ((eq 'IGNORE keyword)
2480 ;; Scan forward skipping over matching <![ ... ]]> 2457 ;; Scan forward skipping over matching <![ ... ]]>
2481 ;; until we find an unmatched "]]>". 2458 ;; until we find an unmatched "]]>".
2623 (if (= between-tags-start (point)) 2600 (if (= between-tags-start (point))
2624 ;; Do nothing. 2601 ;; Do nothing.
2625 nil 2602 nil
2626 ;; We are definitely going to add data characters to the 2603 ;; We are definitely going to add data characters to the
2627 ;; content. 2604 ;; content.
2628 ;; Protocol is that all but last data character item
2629 ;; must have been sent to display engine.
2630 (and content
2631 (stringp (car content))
2632 ;; Gross, disgusting hack to deal with old interface
2633 ;; to display engine. Remove as soon as possible.
2634 (not (memq (w3-element-name w3-p-d-current-element)
2635 '(plaintext style xmp textarea)))
2636 (w3-add-display-item 'text (car content)))
2637 (cond 2605 (cond
2638 ((and (= ?\n (preceding-char)) 2606 ((and (= ?\n (preceding-char))
2639 (/= between-tags-start (1- (point)))) 2607 (/= between-tags-start (1- (point))))
2640 (setq content (cons (buffer-substring between-tags-start 2608 (setq content (cons (buffer-substring between-tags-start
2641 (1- (point))) 2609 (1- (point)))
2642 content)) 2610 content))
2643 ;; Gross, disgusting hack to deal with old interface
2644 ;; to display engine. Remove as soon as possible.
2645 (or (memq (w3-element-name w3-p-d-current-element)
2646 '(plaintext style xmp textarea))
2647 (w3-add-display-item 'text (car content)))
2648 (setq content (cons "\n" content))) 2611 (setq content (cons "\n" content)))
2649 (t 2612 (t
2650 (setq content (cons (buffer-substring between-tags-start 2613 (setq content (cons (buffer-substring between-tags-start
2651 (point)) 2614 (point))
2652 content)))) 2615 content))))
2772 (setq w3-p-d-end-tag-p nil) 2735 (setq w3-p-d-end-tag-p nil)
2773 (setq net-tag-p nil) 2736 (setq net-tag-p nil)
2774 (setq tag-attributes nil) 2737 (setq tag-attributes nil)
2775 (setq tag-end nil))) 2738 (setq tag-end nil)))
2776 2739
2777 ;; Hand items to the display engine.
2778 (cond ((not nodraw)
2779 (set-buffer w3-draw-buffer)
2780 (while (not (eq parse-tag-stream-last-displayed-item
2781 w3-p-d-parse-tag-stream-tail-pointer))
2782 (setq parse-tag-stream-last-displayed-item
2783 (cdr parse-tag-stream-last-displayed-item))
2784 ;; We call w3-handle-single-tag from only one spot so that it
2785 ;; is reasonable to inline it, since it is a big function.
2786 (w3-handle-single-tag
2787 (car (car parse-tag-stream-last-displayed-item))
2788 (cdr (car parse-tag-stream-last-displayed-item))))
2789 (set-buffer parse-buffer)))
2790
2791 ;; End of main while loop. 2740 ;; End of main while loop.
2792 ) 2741 )
2793 2742
2794 ;; We have finished parsing the buffer! 2743 ;; We have finished parsing the buffer!
2795 (if status-message-format 2744 (if status-message-format
2796 (message "%sdone" (format status-message-format 100))) 2745 (message "%sdone" (format status-message-format 100)))
2797 ;; Do this now so the user can see the full results before Emacs
2798 ;; goes off and garbage-collects for an hour. :-(
2799 (if w3-do-incremental-display
2800 (w3-pause))
2801 2746
2802 ;; *** For debugging, save the true parse tree. 2747 ;; *** For debugging, save the true parse tree.
2803 ;; *** Make this look inside *DOCUMENT. 2748 ;; *** Make this look inside *DOCUMENT.
2804 (setq w3-last-parse-tree 2749 (setq w3-last-parse-tree
2805 (w3-element-content w3-p-d-current-element)) 2750 (w3-element-content w3-p-d-current-element))
2806 2751
2807 ;; Return the parse in the format expected, a stream of tags 2752 (w3-element-content w3-p-d-current-element)
2808 ;; possibly with a buffer at the front.
2809 (if nodraw
2810 ;; Discard the *dummy item at start of list.
2811 (cdr parse-tag-stream)
2812 (cons w3-draw-buffer (cdr parse-tag-stream)))
2813
2814 ))) 2753 )))
2815
2816
2817 ;;;
2818 ;;; Initialization of display engine to accept parser output.
2819 ;;;
2820
2821 (defun w3-prepare-draw-buffer-for-parse-buffer ()
2822 (setq list-buffers-directory nil)
2823 (let ((buf (get-buffer-create (url-generate-new-buffer-name
2824 "Untitled")))
2825 (info (mapcar (function (lambda (x) (cons x (symbol-value x))))
2826 w3-persistent-variables)))
2827 (setq w3-draw-buffer buf)
2828 (save-excursion
2829 (set-window-buffer (selected-window) buf)
2830 (set-buffer buf)
2831 (setq w3-draw-buffer (current-buffer))
2832 (erase-buffer)
2833 (buffer-disable-undo (current-buffer))
2834 (mapcar (function (lambda (x) (set (car x) (cdr x)))) info)
2835 (setq w3-last-fill-pos (point))
2836 (setq fill-column (min (- (or w3-strict-width (window-width))
2837 w3-right-border)
2838 (or w3-maximum-line-length (window-width))))
2839 (setq fill-prefix "")
2840 (w3-init-state))))
2841 2754
2842 2755
2843 2756
2844 (provide 'w3-parse) 2757 (provide 'w3-parse)
2845 2758