80
|
1 ;; Created by: Joe Wells, jbw@cs.bu.edu
|
0
|
2 ;; Created on: Sat Sep 30 17:25:40 1995
|
|
3 ;; Filename: w3-parse.el
|
|
4 ;; Purpose: Parse HTML and/or SGML for Emacs W3 browser.
|
|
5
|
82
|
6 ;; Copyright © 1995, 1996, 1997 Joseph Brian Wells
|
2
|
7 ;; Copyright © 1993, 1994, 1995 by William M. Perry (wmperry@cs.indiana.edu)
|
0
|
8 ;;
|
|
9 ;; This program is free software; you can redistribute it and/or modify
|
|
10 ;; it under the terms of the GNU General Public License as published by
|
|
11 ;; the Free Software Foundation; either version 2 of the License, or
|
|
12 ;; (at your option) any later version.
|
|
13 ;;
|
|
14 ;; This program is distributed in the hope that it will be useful,
|
|
15 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
16 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
17 ;; GNU General Public License for more details.
|
|
18 ;;
|
|
19 ;; You should have received a copy of the GNU General Public License
|
80
|
20 ;; along with GNU Emacs; see the file COPYING. If not, write to the
|
|
21 ;; Free Software Foundation, Inc., 59 Temple Place - Suite 330,
|
|
22 ;; Boston, MA 02111-1307, USA.
|
0
|
23 ;;
|
|
24 ;; On November 13, 1995, the license was available at
|
|
25 ;; <URL:ftp://prep.ai.mit.edu/pub/gnu/COPYING-2.0>. It may still be
|
|
26 ;; obtainable via that URL.
|
|
27
|
|
28
|
|
29 ;;;
|
|
30 ;;; Trying to make the best of an evil speed hack.
|
|
31 ;;;
|
|
32
|
|
33 ;; Explanation:
|
|
34
|
|
35 ;; Basically, this file provides one big function (w3-parse-buffer) and
|
|
36 ;; some data structures. However, to avoid code redundancy, I have broken
|
|
37 ;; out some common subexpressions of w3-parse-buffer into separate
|
|
38 ;; functions. I have declared these separate functions with "defsubst" so
|
|
39 ;; they will be inlined into w3-parse-buffer. Also, I have defined them
|
|
40 ;; within eval-when-compile forms, so no definitions will be emitted into
|
|
41 ;; the .elc file for these separate functions. (They will work normally
|
|
42 ;; when the uncompiled file is loaded.)
|
|
43
|
|
44 ;; Each of these subfunctions use some scratch variables in a purely local
|
|
45 ;; fashion. In good software design, I would declare these variables as
|
|
46 ;; close to their use as possible with "let". However, "let"-binding
|
|
47 ;; variables is *SLOW* in Emacs Lisp, even when compiled. Since each of
|
|
48 ;; these functions is executed one or more time during each iteration of
|
|
49 ;; the main loop, I deemed this too expensive. So the main function does
|
|
50 ;; the "let"-binding of these variables. However, I still want to declare
|
|
51 ;; them close to their use, partially to keep the compiler from crying
|
|
52 ;; "Wolf!" when there is no danger (well, maybe a little danger :-), so I
|
|
53 ;; define some macros for this purpose.
|
|
54
|
|
55 ;; Also, there are some variables which are updated throughout the file
|
|
56 ;; (remember this is really all one function). Some of the code which
|
|
57 ;; updates them is located inside the subfunctions. So that the compiler
|
|
58 ;; will not complain, these variables are defined with defvar.
|
|
59
|
|
60 (require 'w3-vars)
|
80
|
61 (require 'mule-sysdp)
|
0
|
62
|
|
63 (eval-when-compile
|
|
64 (defconst w3-p-s-var-list nil
|
|
65 "A list of the scratch variables used by functions called by
|
|
66 w3-parse-buffer which it is w3-parse-buffer's responsibility to
|
|
67 \"let\"-bind.")
|
|
68
|
|
69 ;; *** This is unused and does not belong right here anyway.
|
|
70 (defmacro w3-resolve-numeric-entity (ent)
|
|
71 "Return a string representing the numeric entity ENT (&#ENT;)"
|
|
72 (` (if (< (, ent) 256)
|
|
73 (char-to-string (, ent))
|
|
74 (format "[Too large character: %s]" (, ent)))))
|
|
75
|
|
76 (defmacro w3-p-s-var-def (var)
|
|
77 "Declare VAR as a scratch variable which w3-parse-buffer must
|
|
78 \"let\"-bind."
|
|
79 (` (eval-when-compile
|
|
80 (defvar (, var))
|
|
81 (or (memq '(, var) w3-p-s-var-list)
|
|
82 (setq w3-p-s-var-list (cons '(, var) w3-p-s-var-list))))))
|
|
83
|
|
84 (defmacro w3-p-s-let-bindings (&rest body)
|
|
85 "\"let\"-bind all of the variables in w3-p-s-var-list in BODY."
|
|
86 (` (let (, w3-p-s-var-list)
|
|
87 (,@ body))))
|
|
88 (put 'w3-p-s-let-bindings 'lisp-indent-function 0)
|
|
89 (put 'w3-p-s-let-bindings 'edebug-form-spec t)
|
|
90
|
|
91 (defvar w3-p-d-current-element)
|
|
92 (put 'w3-p-d-current-element 'variable-documentation
|
|
93 "Information structure for the current open element.")
|
|
94
|
|
95 (defvar w3-p-d-exceptions)
|
|
96 (put 'w3-p-d-exceptions 'variable-documentation
|
|
97 "Alist specifying elements (dis)allowed because of an (ex|in)clusion
|
|
98 exception of some containing element (not necessarily the immediately
|
|
99 containing element). Each item specifies a transition for an element
|
|
100 which overrides that specified by the current element's content model.
|
|
101 Each item is of the form (TAG ACTION *same ERRORP).")
|
|
102
|
|
103 (defvar w3-p-d-in-parsed-marked-section)
|
|
104 (put 'w3-p-d-in-parsed-marked-section 'variable-documentation
|
|
105 "Are we in a parsed marked section so that we have to scan for \"]]>\"?")
|
|
106
|
|
107 (defvar w3-p-d-non-markup-chars)
|
|
108 (put 'w3-p-d-non-markup-chars 'variable-documentation
|
|
109 "The characters that do not indicate the start of markup, in the format
|
|
110 for an argument to skip-chars-forward.")
|
|
111
|
|
112 (defvar w3-p-d-null-end-tag-enabled)
|
|
113 (put 'w3-p-d-null-end-tag-enabled 'variable-documentation
|
|
114 "Is the null end tag (\"/\") enabled?")
|
|
115
|
|
116 (defvar w3-p-d-open-element-stack)
|
|
117 (put 'w3-p-d-open-element-stack 'variable-documentation
|
|
118 "A stack of the currently open elements, with the innermost enclosing
|
|
119 element on top and the outermost on bottom.")
|
|
120
|
|
121 (defvar w3-p-d-shortrefs)
|
|
122 (put 'w3-p-d-shortrefs 'variable-documentation
|
|
123 "An alist of the magic entity reference strings in the current
|
|
124 between-tags region and their replacements. Each item is of the format
|
|
125 \(REGEXP . REPLACEMENT-STRING\). Although in SGML shortrefs normally name
|
|
126 entities whose value should be used as the replacement, we have
|
|
127 preexpanded the entities for speed. We have also regexp-quoted the
|
|
128 strings to be replaced, so they can be used with looking-at. This should
|
|
129 never be in an element's overrides field unless
|
|
130 w3-p-d-shortref-chars is also in the field.")
|
|
131
|
|
132 (defvar w3-p-d-shortref-chars)
|
|
133 (put 'w3-p-d-shortref-chars 'variable-documentation
|
|
134 "A string of the characters which can start shortrefs in the current
|
|
135 between-tags region. This must be in a form which can be passed to
|
|
136 skip-chars-forward and must contain exactly the characters which start the
|
|
137 entries in w3-p-d-shortrefs. If this variable is mentioned in the
|
|
138 overrides field of an element, its handling is magical in that the
|
|
139 variable w3-p-d-non-markup-chars is saved to the element's undo-list and
|
|
140 updated at the same time. This should never be in an element's overrides
|
|
141 field unless w3-p-d-shortrefs is also in the field.")
|
|
142
|
|
143 (defvar w3-p-d-tag-name)
|
|
144 (put 'w3-p-d-tag-name 'variable-documentation
|
|
145 "Name of tag we are looking at, as an Emacs Lisp symbol.
|
|
146 Only non-nil when we are looking at a tag.")
|
|
147
|
|
148 (defvar w3-p-d-end-tag-p)
|
|
149 (put 'w3-p-d-end-tag-p 'variable-documentation
|
|
150 "Is the tag we are looking at an end tag?
|
|
151 Only non-nil when we are looking at a tag.")
|
|
152
|
|
153 )
|
|
154
|
|
155
|
|
156 ;;;
|
|
157 ;;; HTML syntax error messages.
|
|
158 ;;;
|
|
159
|
|
160 (eval-when-compile
|
|
161
|
|
162 (defvar w3-p-d-debug-url)
|
|
163 (put 'w3-p-d-debug-url 'variable-documentation
|
|
164 "Whether to print the URL being parsed before an error messages.
|
|
165 Only true for the first error message.")
|
|
166
|
|
167 ;; The level parameter indicates whether the error is (1) very
|
|
168 ;; serious, must be displayed to all users, (2) invalid HTML, but the
|
|
169 ;; user should only be told if the user has indicated interest, or (3)
|
|
170 ;; valid HTML which is bad because it appears to rely on the way certain
|
|
171 ;; browsers will display it, which should only be displayed to the user
|
|
172 ;; if they have really asked for it.
|
|
173
|
|
174 (defmacro w3-debug-html (&rest body)
|
|
175 "Emit a warning message.
|
|
176 These keywords may be used at the beginning of the arguments:
|
|
177 :mandatory-if sexp -- force printing if sexp evaluates non-nil.
|
|
178 :bad-style -- do not print unless w3-debug-html is 'style.
|
|
179 :outer -- do not include the current element in the element
|
|
180 context we report.
|
|
181 :nocontext -- do not include context where error detected.
|
|
182 The remaining parameters are treated as the body of a progn, the value of
|
|
183 which must be a string to use as the error message."
|
|
184 (let (mandatory-if bad-style outer nocontext condition)
|
|
185 (while (memq (car body) '(:mandatory-if :bad-style :outer :nocontext))
|
|
186 (cond ((eq ':mandatory-if (car body))
|
|
187 (setq mandatory-if (car (cdr body)))
|
|
188 (setq body (cdr (cdr body))))
|
|
189 ((eq ':bad-style (car body))
|
|
190 (setq bad-style t)
|
|
191 (setq body (cdr body)))
|
|
192 ((eq ':nocontext (car body))
|
|
193 (setq nocontext t)
|
|
194 (setq body (cdr body)))
|
|
195 ((eq ':outer (car body))
|
|
196 (setq outer t)
|
|
197 (setq body (cdr body)))))
|
|
198 (setq condition (if bad-style
|
|
199 '(eq 'style w3-debug-html)
|
|
200 'w3-debug-html))
|
|
201 (if mandatory-if
|
|
202 (setq condition
|
|
203 (` (or (, mandatory-if)
|
|
204 (, condition)))))
|
|
205 (` (if (, condition)
|
|
206 (let ((message (progn (,@ body))))
|
|
207 (if message
|
|
208 (w3-debug-html-aux message
|
|
209 (,@ (if nocontext
|
|
210 (list outer nocontext)
|
|
211 (if outer '(t)))))))))))
|
|
212
|
|
213 ;; This is unsatisfactory.
|
|
214 (put 'w3-debug-html 'lisp-indent-function 0)
|
|
215
|
|
216 (put 'w3-debug-html 'edebug-form-spec
|
|
217 '([&rest &or ":nocontext" ":outer" [":mandatory-if" form] ":bad-style"]
|
|
218 &rest form))
|
|
219 )
|
|
220
|
|
221 (defun w3-debug-html-aux (message &optional outer nocontext)
|
|
222 (let (
|
|
223 ;; We have already determined whether the user should see the
|
|
224 ;; message, so don't let w3-warn suppress it.
|
|
225 (w3-debug-html t))
|
|
226 ;; Print the URL before the first error message for a document.
|
|
227 (cond (w3-p-d-debug-url
|
|
228 (let ((url (url-view-url t)))
|
|
229 (w3-warn 'html
|
|
230 (if (or (null url)
|
|
231 (string-equal "" url))
|
|
232 (format "HTML errors for buffer %s"
|
|
233 (current-buffer))
|
|
234 (format "HTML errors for <URL:%s>" url))))
|
|
235 (setq w3-p-d-debug-url nil)))
|
|
236 (w3-warn 'html
|
|
237 (if nocontext
|
|
238 message
|
|
239 (concat message
|
|
240 ;; Display context information for each error
|
|
241 ;; message.
|
|
242 "\n Containing elements: "
|
|
243 (w3-open-elements-string (if outer 1))
|
|
244 (concat
|
|
245 "\n Text around error: "
|
|
246 (save-restriction
|
|
247 (widen)
|
|
248 (progn
|
|
249 (insert "*ERROR*")
|
|
250 (prog1
|
|
251 (w3-quote-for-string
|
|
252 (buffer-substring
|
|
253 (max (- (point) 27) (point-min))
|
|
254 (min (+ (point) 20) (point-max))))
|
|
255 (delete-char -7))))))))))
|
|
256
|
|
257 (defun w3-quote-for-string (string)
|
|
258 (save-excursion
|
|
259 (set-buffer (get-buffer-create " w3-quote-whitespace"))
|
|
260 (erase-buffer)
|
|
261 (insert string)
|
|
262 (goto-char (point-min))
|
|
263 (insert "\"")
|
|
264 (while (progn
|
|
265 (skip-chars-forward "^\"\\\t\n\r")
|
|
266 (not (eobp)))
|
70
|
267 (insert "\\" (cdr (assq (following-char) '((?\" . "\"")
|
|
268 (?\\ . "\\")
|
|
269 (?\t . "t")
|
|
270 (?\n . "n")
|
|
271 (?\r . "r")))))
|
0
|
272 (delete-char 1))
|
|
273 (insert "\"")
|
|
274 (buffer-string)))
|
|
275
|
|
276
|
|
277 ;;;
|
|
278 ;;; General entity references and numeric character references.
|
|
279 ;;;
|
|
280
|
|
281 ;; *** MULE conversion?
|
|
282 ;; *** I18N HTML support?
|
|
283
|
|
284 (let ((html-entities w3-html-entities))
|
|
285 (while html-entities
|
|
286 (put (car (car html-entities)) 'html-entity-expansion
|
|
287 (cons 'CDATA (if (integerp (cdr (car html-entities)))
|
2
|
288 (char-to-string
|
80
|
289 (mule-make-iso-character (cdr (car html-entities))))
|
0
|
290 (cdr (car html-entities)))))
|
|
291 (setq html-entities (cdr html-entities))))
|
|
292
|
|
293 ;; These are handled differently than the normal HTML entities because
|
|
294 ;; we need to define the entities with 'nil instead of 'CDATA so
|
|
295 ;; that they are correctly scanned for new markup.
|
|
296 ;;
|
|
297 ;; from jbw@cs.bu.edu
|
|
298 ;;
|
|
299 ;;> Of course, this differs from the specification a bit. The W3C tech
|
|
300 ;;> report defines all of these as SYSTEM entities. This potentially means
|
|
301 ;;> that they can be used in more contexts. The method I outlined above
|
|
302 ;;> means "&smiley;" can only be used in contexts where IMG is a valid
|
|
303 ;;> element. I am not sure exactly where it is okay to use a SYSTEM entity.
|
|
304 ;;> I think anywhere that data characters are accepted.
|
|
305 ;;
|
|
306 ;; I find this acceptable, as just what the hell are you supposed to do with
|
|
307 ;; &computer; as part of a value of a form input when you display it and/or
|
|
308 ;; submit it?!
|
|
309
|
|
310 (let ((html-entities w3-graphic-entities)
|
|
311 (cur nil))
|
|
312 (while html-entities
|
|
313 (setq cur (car html-entities)
|
|
314 html-entities (cdr html-entities))
|
|
315 (put (nth 0 cur) 'html-entity-expansion
|
2
|
316 (cons 'nil (format "<img src=\"%s/%s%s\" alt=\"%s\">"
|
0
|
317 w3-icon-directory
|
|
318 (nth 1 cur)
|
|
319 (if w3-icon-format
|
|
320 (concat "." (symbol-name w3-icon-format))
|
|
321 "")
|
|
322 (or (nth 3 cur) (nth 2 cur)))))))
|
|
323
|
|
324 ;; These are the general entities in HTML 3.0 in terms of which the math
|
|
325 ;; shortrefs are defined:
|
|
326 ;;
|
|
327 ;; <!ENTITY REF1 STARTTAG "SUP">
|
|
328 ;; <!ENTITY REF2 ENDTAG "SUP">
|
|
329 ;; <!ENTITY REF3 STARTTAG "SUB">
|
|
330 ;; <!ENTITY REF4 ENDTAG "SUB">
|
|
331 ;; <!ENTITY REF5 STARTTAG "BOX">
|
|
332 ;; <!ENTITY REF6 ENDTAG "BOX">
|
|
333 ;;
|
|
334 ;; We're ignoring them because these names should really be local to the
|
|
335 ;; DTD and not visible in the document. They might change at any time in
|
|
336 ;; future HTML standards.
|
|
337
|
|
338 ;; <!--Entities for language-dependent presentation (BIDI and contextual analysis) -->
|
|
339 ;; <!ENTITY zwnj CDATA "‌"-- zero width non-joiner-->
|
|
340 ;; <!ENTITY zwj CDATA "‍"-- zero width joiner-->
|
|
341 ;; <!ENTITY lrm CDATA "‎"-- left-to-right mark-->
|
|
342 ;; <!ENTITY rlm CDATA "‏"-- right-to-left mark-->
|
|
343
|
|
344 ;; Entity names are case sensitive!
|
|
345
|
|
346 ;; & should only be recognized when followed by letter or # and
|
|
347 ;; digit or # and letter.
|
|
348
|
|
349 (eval-when-compile
|
|
350
|
|
351 (w3-p-s-var-def w3-p-s-entity)
|
|
352 (w3-p-s-var-def w3-p-s-pos)
|
|
353 (w3-p-s-var-def w3-p-s-num)
|
|
354 ;; Destroys free variables:
|
|
355 ;; w3-p-s-entity, w3-p-s-pos, w3-p-s-num
|
|
356 ;; Depends on case-fold-search being t.
|
|
357 (defsubst w3-expand-entity-at-point-maybe ()
|
|
358 ;; We are looking at a &.
|
|
359 ;; Only &A or  or &#A syntax is special.
|
|
360 (cond
|
|
361 ((and (looking-at "&\\([a-z][-a-z0-9.]*\\)[\ ;\n]?") ; \n should be \r
|
|
362 ;; We are looking at a general entity reference, maybe undefined.
|
|
363 (setq w3-p-s-entity
|
|
364 (get
|
|
365 (intern (buffer-substring (match-beginning 1) (match-end 1)))
|
|
366 'html-entity-expansion)))
|
|
367
|
|
368 ;; If the reference was undefined, then for SGML, we should really
|
|
369 ;; issue a warning and delete the reference. However, the HTML
|
|
370 ;; standard (contradicting the SGML standard) says to leave the
|
|
371 ;; undefined reference in the text.
|
|
372
|
|
373 ;; We are looking at a defined general entity reference.
|
|
374 (replace-match "")
|
|
375 (cond ((eq 'CDATA (car w3-p-s-entity))
|
|
376 ;; Leave point after expansion so we don't rescan it.
|
|
377 (insert (cdr w3-p-s-entity)))
|
|
378 ((memq (car w3-p-s-entity) '(nil STARTTAG ENDTAG MS MD))
|
|
379 ;; nil is how I mark ordinary entities.
|
|
380 ;; The replacement text gets rescanned for all of these.
|
|
381 (setq w3-p-s-pos (point))
|
|
382 (insert (cdr (assq (car w3-p-s-entity)
|
|
383 '((nil . "")
|
|
384 (STARTTAG . "<")
|
|
385 (ENDTAG . "</")
|
|
386 (MS . "<![")
|
|
387 (MD . "<!"))))
|
|
388 (cdr w3-p-s-entity)
|
|
389 (cdr (assq (car w3-p-s-entity)
|
|
390 '((nil . "")
|
|
391 (STARTTAG . ">")
|
|
392 (ENDTAG . ">")
|
|
393 (MS . "]]>")
|
|
394 (MD . ">")))))
|
|
395 (goto-char w3-p-s-pos)
|
|
396 ;; *** Strictly speaking, if we parse anything from the
|
|
397 ;; replacement text, it must end before the end of the
|
|
398 ;; replacement text.
|
|
399 )
|
|
400 ((eq 'SDATA (car w3-p-s-entity))
|
|
401 (insert "[Unimplemented SDATA \"%s\"]" (cdr w3-p-s-entity)))
|
|
402 ((eq 'PI (car w3-p-s-entity))
|
|
403 ;; We are currently ignoring processing instructions.
|
|
404 ;; *** Strictly speaking, we should issue a warning if this
|
|
405 ;; occurs in a attribute value.
|
|
406 )
|
|
407 (t
|
|
408 ;; *** We don't handle external entities yet.
|
|
409 (error "[Unimplemented entity: \"%s\"]" w3-p-s-entity))))
|
|
410
|
|
411 ((looking-at "&#[0-9][0-9]*\\([\ ;\n]?\\)") ; \n should be \r
|
|
412 ;; We are looking at a numeric character reference.
|
|
413 ;; Ensure the number is already terminated by a semicolon or carriage
|
|
414 ;; return so we can use "read" to get it as a number quickly.
|
|
415 (cond ((= (match-beginning 1) (match-end 1))
|
|
416 ;; This is very uncommon, so we don't have to be quick here but
|
|
417 ;; rather correct.
|
|
418 (save-excursion
|
|
419 (goto-char (match-end 0)) ; same as match-end 1
|
|
420 (insert ?\;))
|
|
421 ;; Set up the match data properly
|
|
422 (looking-at "&#[0-9][0-9]*;")))
|
|
423 (forward-char 2)
|
|
424 (setq w3-p-s-num (read (current-buffer)))
|
|
425 ;; Always leave point after the expansion of a numeric
|
|
426 ;; character reference, like it were a CDATA entity.
|
|
427 (replace-match "")
|
|
428 ;; char-to-string will hopefully do something useful with characters
|
|
429 ;; larger than 255. I think in MULE it does. Is this true?
|
|
430 ;; Bill wants to call w3-resolve-numeric-entity here, but I think
|
|
431 ;; that functionality belongs in char-to-string.
|
|
432 ;; The largest valid character in the I18N version of HTML is 65533.
|
80
|
433 ;; ftp://ds.internic.net/internet-drafts/draft-ietf-html-i18n-01.txt
|
2
|
434 ;; wrongo! Apparently, mule doesn't do sane things with char-to-string
|
|
435 ;; -wmp 7/9/96
|
|
436 (insert (char-to-string
|
80
|
437 (mule-make-iso-character w3-p-s-num))))
|
0
|
438 ((looking-at "&#\\(re\\|rs\\|space\\|tab\\)[\ ;\n]?") ; \n should be \r
|
|
439 (replace-match (assq (upcase (char-after (+ 3 (point))))
|
|
440 '(;; *** Strictly speaking, record end should be
|
|
441 ;; carriage return.
|
|
442 (?E . "\n") ; RE
|
|
443 ;; *** And record start should be line feed.
|
|
444 (?S . "") ; RS
|
|
445 (?P . " ") ; SPACE
|
|
446 (?A . "\t")))) ; TAB
|
|
447 ;; Leave point after the expansion of a character reference, so it
|
|
448 ;; doesn't get rescanned.
|
|
449 ;; *** Strictly speaking, we should issue a warning for &#foo; if foo
|
|
450 ;; is not a function character in the SGML declaration.
|
|
451 )
|
|
452
|
70
|
453 ((eq ?& (following-char))
|
0
|
454 ;; We are either looking at an undefined reference or a & that does
|
|
455 ;; not start a reference (in which case we should not have been called).
|
|
456 ;; Skip over the &.
|
|
457 (forward-char 1))
|
|
458
|
|
459 (t
|
|
460 ;; What is the code doing calling us if we're not looking at a "&"?
|
|
461 (error "this should never happen"))))
|
|
462
|
|
463 )
|
|
464
|
|
465
|
|
466 ;;;
|
|
467 ;;; Syntax table used in markup declarations.
|
|
468 ;;;
|
|
469
|
|
470 (defvar w3-sgml-md-syntax-table
|
|
471 (let ((table (make-syntax-table))
|
|
472 (items '(
|
|
473 (0 "." 255) ; clear everything
|
|
474 (?\r " ")
|
|
475 (?\t " ")
|
|
476 (?\n " ")
|
|
477 (32 " ") ; space
|
|
478 (?< "\(>")
|
|
479 (?> "\)<")
|
|
480 (?\( "\(\)")
|
|
481 (?\) "\)\(")
|
|
482 (?\[ "\(\]")
|
|
483 (?\] "\)\[")
|
|
484 (?\" "\"")
|
|
485 (?\' "\"")
|
|
486 (?a "w" ?z)
|
|
487 (?A "w" ?Z)
|
|
488 (?0 "w" ?9)
|
|
489 (?. "w")
|
|
490 ;; "-" can be a character in a NAME, but it is also used in
|
|
491 ;; "--" as both a comment start and end within SGML
|
|
492 ;; declarations ("<!" ... ">"). In HTML, it is only used
|
|
493 ;; as a NAME character in the parameter entities
|
|
494 ;; Content-Type, HTTP-Method, and style-notations and in
|
|
495 ;; the attribute name http-equiv and in the notation names
|
|
496 ;; dsssl-lite and w3c-style. We would like to be able to
|
|
497 ;; train Emacs to skip over these kinds of comments with
|
|
498 ;; forward-sexp and backward-sexp. Is there any way to
|
|
499 ;; teach Emacs how to do this? It doesn't seem to be the
|
|
500 ;; case.
|
|
501 (?- "w")
|
|
502 )))
|
|
503 (while items
|
|
504 (let* ((item (car items))
|
|
505 (char (car item))
|
|
506 (syntax (car (cdr item)))
|
|
507 (bound (or (car-safe (cdr-safe (cdr item)))
|
|
508 char)))
|
|
509 (while (<= char bound)
|
|
510 (modify-syntax-entry char syntax table)
|
|
511 (setq char (1+ char))))
|
|
512 (setq items (cdr items)))
|
|
513 table)
|
|
514 "A syntax table for parsing SGML markup declarations.")
|
|
515
|
|
516
|
|
517 ;;;
|
|
518 ;;; Element information data type.
|
|
519 ;;;
|
|
520
|
|
521 ;; The element information data type is used in two ways:
|
|
522 ;;
|
|
523 ;; * To store the DTD, there is one element record for each element in
|
|
524 ;; the DTD.
|
|
525 ;;
|
|
526 ;; * To store information for open elements in the current parse tree.
|
|
527 ;; Each such element is initialized by copying the element record
|
|
528 ;; from the DTD. This means that values in the fields can not be
|
|
529 ;; destructively altered, although of course the fields can be
|
|
530 ;; changed.
|
|
531
|
|
532 ;; The cells in this vector are:
|
|
533 ;;
|
|
534 ;; name: the element's name (a generic identifier).
|
|
535 ;;
|
|
536 ;; end-tag-name: a symbol whose name should be the result of prefixing
|
|
537 ;; the generic-identifier with a slash. This is a convenience value for
|
|
538 ;; interfacing with the display engine which expects a stream of start
|
|
539 ;; and end tags in this format rather than a tree.
|
|
540 ;;
|
|
541 ;; content-model: a data structure describing what elements or character
|
|
542 ;; data we expect to find within this element. This is either a symbol
|
|
543 ;; listed here:
|
|
544 ;;
|
|
545 ;; EMPTY: no content, no end-tag allowed.
|
|
546 ;; CDATA: all data characters until "</[a-z]" is seen.
|
|
547 ;; XCDATA: special non-SGML-standard mode which includes all data
|
|
548 ;; characters until "</foo" is seen where "foo" is the name of this
|
|
549 ;; element. (for XMP and LISTING)
|
|
550 ;; XXCDATA: special non-SGML-standard mode which includes all data
|
|
551 ;; until end-of-entity (end-of-buffer for us). (for PLAINTEXT)
|
|
552 ;; RCDATA: all data characters until "</[a-z]" is seen, except that
|
|
553 ;; entities are expanded first, although the expansions are not
|
|
554 ;; scanned for end-tags.
|
|
555 ;; XINHERIT: special non-SGML-standard mode which means to use the
|
|
556 ;; content model of the containing element instead.
|
|
557 ;;
|
|
558 ;; or a vector of this structure:
|
|
559 ;;
|
|
560 ;; [(INCLUDES INCSPACEP (((TAG ...) . TRANSITION) ...) DEFAULT) ...]
|
|
561 ;;
|
|
562 ;; where INCLUDES is of the format:
|
|
563 ;;
|
|
564 ;; (TAG ...)
|
|
565 ;;
|
|
566 ;; where each TRANSITION is one of these:
|
|
567 ;;
|
|
568 ;; (ACTION NEW-STATE ERRORP)
|
|
569 ;; (ACTION NEW-STATE)
|
|
570 ;; (ACTION)
|
|
571 ;;
|
|
572 ;; where DEFAULT is one of these:
|
|
573 ;;
|
|
574 ;; nil or TRANSITION
|
|
575 ;;
|
|
576 ;; where the meaning of the components is:
|
|
577 ;;
|
|
578 ;; INCLUDES is a list of tags for which the transition (*include *same
|
|
579 ;; nil) applies.
|
|
580 ;;
|
|
581 ;; DEFAULT if non-nil is a transition that should be taken when
|
|
582 ;; matching any possibility not explicitly listed in another
|
|
583 ;; TRANSITION, except for data characters containing only whitespace.
|
|
584 ;;
|
|
585 ;; INCSPACEP specifies how to handle data characters which include
|
|
586 ;; only whitespace characters. The value is non-nil to indicate
|
|
587 ;; (*include *same nil) or nil to indicate (*discard *same nil).
|
|
588 ;;
|
|
589 ;; TAG is a symbol corresponding to the start-tag we are looking at,
|
|
590 ;; or *data when seeing character data that includes at least one
|
|
591 ;; non-space character.
|
|
592 ;;
|
|
593 ;; ACTION is one of:
|
|
594 ;; *close: Close this element and try again using content model of
|
|
595 ;; enclosing element. (Note that this does not apply to the
|
|
596 ;; case of an element being closed by its own end-tag.)
|
|
597 ;; *include: Process new element as subelement of this one or
|
|
598 ;; include data characters directly.
|
|
599 ;; *discard: Discard a start-tag or data characters.
|
|
600 ;; *retry: Try again after processing NEW-STATE and ERRORP.
|
|
601 ;; ELEMENT: Open ELEMENT (with default attributes), then try again
|
|
602 ;; using its content model.
|
|
603 ;;
|
|
604 ;; NEW-STATE (optional, default *same) is the index of the state to
|
|
605 ;; move to after processing the element or one of these:
|
|
606 ;; *same: no state change occurs.
|
2
|
607 ;; *next: change the current state + 1.
|
0
|
608 ;; The initial state is 0. NEW-STATE does not matter if ACTION is
|
|
609 ;; *close.
|
|
610 ;;
|
|
611 ;; ERRORP (optional, default nil) if non-nil indicates this transition
|
|
612 ;; represents an error. The error message includes this value if it
|
|
613 ;; is a string.
|
|
614 ;;
|
|
615 ;; If no matching transition is found, the default transition is
|
|
616 ;; (*discard *same "not allowed here").
|
|
617 ;;
|
|
618 ;; overrides: An alist of pairs of the form (VAR REPLACEP . VALUE).
|
|
619 ;; When this element is opened, the old value of VAR is saved in the
|
|
620 ;; undo-list. If REPLACEP is non-nil, then VAR gets value VALUE,
|
|
621 ;; otherwise VAR gets value (append VALUE (symbol-value VAR)). Useful
|
|
622 ;; values for VAR are:
|
|
623 ;;
|
|
624 ;; w3-p-d-exceptions: See doc string.
|
|
625 ;;
|
|
626 ;; w3-p-d-shortrefs: See doc string.
|
|
627 ;;
|
|
628 ;; w3-p-d-shortref-chars: See doc string.
|
|
629 ;;
|
|
630 ;; end-tag-omissible: Whether it is legal to omit the end-tag of this
|
|
631 ;; element. If an end-tag is inferred for an element whose end tag is
|
|
632 ;; not omissible, an error message is given.
|
|
633 ;;
|
|
634 ;; state: The current state in the content model. Preset to the initial
|
|
635 ;; state of 0.
|
|
636 ;;
|
|
637 ;; undo-list: an alist of of former values of local variables
|
|
638 ;; of w3-parse-buffer to restore upon closing this element. Each
|
|
639 ;; item on the list is of the format (VAR . VALUE-TO-RESTORE).
|
|
640 ;;
|
|
641 ;; attributes: an alist of attributes and values. Each item on
|
|
642 ;; this list is of the format (ATTRIBUTE-NAME . VALUE). Each
|
|
643 ;; ATTRIBUTE-NAME is a symbol and each attribute value is a
|
|
644 ;; string.
|
|
645 ;;
|
|
646 ;; content: a list of the accumulated content of the element. While the
|
|
647 ;; element is open, the list is in order from latest to earliest,
|
|
648 ;; otherwise it is in order from earliest to latest. Each member is
|
|
649 ;; either a string of data characters or a list of the form (NAME
|
|
650 ;; ATTRIBUTES CONTENT), where NAME is the subelement's name, ATTRIBUTES
|
|
651 ;; is an alist of the subelement's attribute names (lowercase symbols)
|
|
652 ;; and their values (strings), and CONTENT is the subelement's content.
|
|
653
|
|
654 (eval-when-compile
|
|
655
|
|
656 (defconst w3-element-fields
|
|
657 '(name end-tag-name content-model state overrides undo-list
|
|
658 content attributes end-tag-omissible deprecated))
|
|
659
|
|
660 (let* ((fields w3-element-fields)
|
|
661 (index (1- (length fields))))
|
|
662 (while fields
|
|
663 (let* ((field (symbol-name (car fields)))
|
|
664 (get-sym (intern (concat "w3-element-" field)))
|
|
665 (set-sym (intern (concat "w3-set-element-" field))))
|
|
666 (eval (` (progn
|
|
667 (defmacro (, get-sym) (element)
|
|
668 (list 'aref element (, index)))
|
|
669 (defmacro (, set-sym) (element value)
|
|
670 (list 'aset element (, index) value))))))
|
|
671 (setq fields (cdr fields))
|
|
672 (setq index (1- index))))
|
|
673
|
|
674 (defmacro w3-make-element ()
|
|
675 (list 'make-vector (length w3-element-fields) nil))
|
|
676
|
|
677 ;; *** move this to be with DTD declaration.
|
|
678 (defmacro w3-fresh-element-for-tag (tag)
|
|
679 (` (copy-sequence
|
|
680 (or (get (, tag) 'html-element-info)
|
|
681 (error "unimplemented element %s"
|
|
682 (w3-sgml-name-to-string (, tag)))))))
|
|
683
|
|
684 ;; *** move this to be with DTD declaration.
|
|
685 (defmacro w3-known-element-p (tag)
|
|
686 (` (get (, tag) 'html-element-info)))
|
|
687
|
|
688 (defsubst w3-sgml-name-to-string (sym)
|
|
689 (upcase (symbol-name sym)))
|
|
690
|
|
691 )
|
|
692
|
|
693
|
|
694 ;;;
|
|
695 ;;; Parse tree manipulation.
|
|
696 ;;;
|
|
697
|
|
698 ;; ;; Find the name of the previous element or a substring of the
|
|
699 ;; ;; preceding data characters.
|
|
700 ;; (let ((content (w3-element-content (car stack))))
|
|
701 ;; (while content
|
|
702 ;; (cond
|
|
703 ;; ((and (stringp (car content))
|
|
704 ;; (not (string-match "\\`[ \t\n\r]*\\'" (car content))))
|
|
705 ;; (setq prior-item (car content))
|
|
706 ;; ;; Trim trailing whitespace
|
|
707 ;; (if (string-match "\\(.*[^ \t\n\r]\\)[ \t\n\r]*\\'" prior-item)
|
|
708 ;; (setq prior-item (substring prior-item 0 (match-end 1))))
|
|
709 ;; (if (> (length prior-item) 8)
|
|
710 ;; (setq prior-item (concat "..." (substring prior-item -8))))
|
|
711 ;; (setq prior-item (w3-quote-for-string prior-item))
|
|
712 ;; (setq prior-item (concat "\(after " prior-item "\)"))
|
|
713 ;; (setq content nil))
|
|
714 ;; ((and (consp (car content))
|
|
715 ;; (symbolp (car (car content))))
|
|
716 ;; (setq prior-item
|
|
717 ;; (concat "\(after "
|
|
718 ;; (w3-sgml-name-to-string (car (car content)))
|
|
719 ;; "\)"))
|
|
720 ;; (setq content nil))
|
|
721 ;; (t
|
|
722 ;; (setq content (cdr content))))))
|
|
723
|
|
724 ;; Only used for HTML debugging.
|
|
725 (defun w3-open-elements-string (&optional skip-count)
|
|
726 (let* ((stack (nthcdr (or skip-count 0)
|
|
727 (cons w3-p-d-current-element
|
|
728 w3-p-d-open-element-stack)))
|
|
729 ;;(prior-item "(at start)")
|
|
730 result)
|
|
731 ;; Accumulate the names of the enclosing elements.
|
|
732 (while stack
|
|
733 (let ((element (w3-element-name (car stack))))
|
|
734 (if (eq '*holder element)
|
|
735 nil
|
|
736 ;; Only include *DOCUMENT if there are no other elements.
|
|
737 (if (or (not (eq '*document element))
|
|
738 (null result))
|
|
739 (setq result (cons (w3-sgml-name-to-string element)
|
|
740 result)))))
|
|
741 (setq stack (cdr stack)))
|
|
742 (setq result (mapconcat 'identity result ":"))
|
|
743 (if result
|
|
744 ;;(concat
|
|
745 result
|
|
746 ;; prior-item)
|
|
747 "[nowhere!]")))
|
|
748
|
|
749 ;; *** This doesn't really belong here, but where?
|
|
750 (eval-when-compile
|
|
751 (defmacro w3-invalid-sgml-chars ()
|
|
752 "Characters not allowed in an SGML document using the reference
|
|
753 concrete syntax (i.e. HTML). Returns a string in the format expected by
|
|
754 skip-chars-forward."
|
|
755 "\000-\010\013\014\016-\037\177-\237"))
|
|
756
|
|
757 (eval-when-compile
|
|
758 ;; Uses:
|
|
759 ;; w3-p-d-null-end-tag-enabled, w3-p-d-in-parsed-marked-section,
|
|
760 ;; w3-p-d-shortref-chars
|
|
761 ;; Modifies free variable:
|
|
762 ;; w3-p-d-non-markup-chars
|
|
763 (defsubst w3-update-non-markup-chars ()
|
|
764 (setq w3-p-d-non-markup-chars
|
|
765 (concat "^&<"
|
|
766 (w3-invalid-sgml-chars)
|
|
767 (if w3-p-d-null-end-tag-enabled "/" "")
|
|
768 (if w3-p-d-in-parsed-marked-section "]" "")
|
|
769 (or w3-p-d-shortref-chars ""))))
|
|
770 )
|
|
771
|
|
772 (eval-when-compile
|
|
773 (w3-p-s-var-def w3-p-s-overrides)
|
|
774 (w3-p-s-var-def w3-p-s-undo-list)
|
|
775 (w3-p-s-var-def w3-p-s-var)
|
|
776 ;; Uses free variables:
|
|
777 ;; w3-p-d-non-markup-chars
|
|
778 ;; Modifies free variables:
|
|
779 ;; w3-p-d-current-element, w3-p-d-open-element-stack
|
|
780 ;; Destroys free variables:
|
|
781 ;; w3-p-s-overrides, w3-p-s-undo-list, w3-p-s-var
|
|
782 (defsubst w3-open-element (tag attributes)
|
|
783
|
|
784 ;; Push new element on stack.
|
|
785 (setq w3-p-d-open-element-stack (cons w3-p-d-current-element
|
|
786 w3-p-d-open-element-stack))
|
|
787 (setq w3-p-d-current-element (w3-fresh-element-for-tag tag))
|
|
788
|
|
789 ;; Warn if deprecated or obsolete.
|
|
790 (if (w3-element-deprecated w3-p-d-current-element)
|
|
791 (w3-debug-html :outer
|
|
792 (format "%s element %s."
|
|
793 (if (eq 'obsolete
|
|
794 (w3-element-deprecated w3-p-d-current-element))
|
|
795 "Obsolete"
|
|
796 "Deprecated")
|
|
797 (w3-sgml-name-to-string
|
|
798 (w3-element-name w3-p-d-current-element)))))
|
|
799
|
|
800 ;; Store attributes.
|
|
801 ;; *** we are not handling #CURRENT attributes (HTML has none).
|
|
802 (w3-set-element-attributes w3-p-d-current-element attributes)
|
|
803 ;; *** Handle default attribute values.
|
|
804 ;; *** Fix the attribute name for unnamed values. Right now they will
|
|
805 ;; be in the attribute list as items of the format (VALUE . VALUE) where
|
|
806 ;; both occurrences of VALUE are the same. The first one needs to be
|
|
807 ;; changed to the proper attribute name by consulting the DTD.
|
|
808 ;; ********************
|
|
809
|
|
810 ;; Handle syntax/semantics overrides of new current element.
|
|
811 (cond ((w3-element-overrides w3-p-d-current-element)
|
|
812 (setq w3-p-s-overrides
|
|
813 (w3-element-overrides w3-p-d-current-element))
|
|
814 (setq w3-p-s-undo-list nil)
|
|
815 (while w3-p-s-overrides
|
|
816 (setq w3-p-s-var (car (car w3-p-s-overrides)))
|
|
817 (setq w3-p-s-undo-list
|
|
818 (cons (cons w3-p-s-var
|
|
819 (symbol-value w3-p-s-var))
|
|
820 w3-p-s-undo-list))
|
|
821 (set w3-p-s-var (if (car (cdr (car w3-p-s-overrides)))
|
|
822 (cdr (cdr (car w3-p-s-overrides)))
|
|
823 (append (cdr (cdr (car w3-p-s-overrides)))
|
|
824 (symbol-value w3-p-s-var))))
|
|
825 ;; *** HACK HACK.
|
|
826 ;; Magic handling of w3-p-d-shortref-chars.
|
|
827 (cond ((eq 'w3-p-d-shortref-chars w3-p-s-var)
|
|
828 (setq w3-p-s-undo-list
|
|
829 (cons (cons 'w3-p-d-non-markup-chars
|
|
830 w3-p-d-non-markup-chars)
|
|
831 w3-p-s-undo-list))
|
|
832 (w3-update-non-markup-chars)))
|
|
833 (setq w3-p-s-overrides (cdr w3-p-s-overrides)))
|
|
834 (w3-set-element-undo-list w3-p-d-current-element
|
|
835 w3-p-s-undo-list)))
|
|
836
|
|
837 ;; Handle content-model inheritance. (Very non-SGML!)
|
|
838 (if (eq 'XINHERIT (w3-element-content-model w3-p-d-current-element))
|
|
839 (w3-set-element-content-model
|
|
840 w3-p-d-current-element
|
|
841 (w3-element-content-model (car w3-p-d-open-element-stack))))
|
|
842
|
80
|
843 )
|
0
|
844 )
|
|
845
|
|
846 ;; The protocol for handing items to the display engine is as follows.
|
|
847 ;;
|
|
848 ;; For an element, send (START-TAG . ATTS), each member of the content,
|
|
849 ;; and (END-TAG . nil) if the element is allowed to have an end tag.
|
|
850 ;;
|
|
851 ;; For data characters, send (text . DATA-CHARACTERS).
|
|
852 ;;
|
|
853 ;; Exceptions:
|
|
854 ;;
|
|
855 ;; For PLAINTEXT, STYLE, XMP, TEXTAREA send:
|
|
856 ;; (START-TAG . ((data . DATA-CHARACTERS) . ATTS)).
|
|
857 ;;
|
|
858 ;; *** This requires somehow eliminating any subelements of the TEXTAREA
|
|
859 ;; element. TEXTAREA can contain subelements in HTML 3.0.
|
|
860 ;;
|
|
861 ;; For LISTING, send (text . DATA-CHARACTERS). (Is this really correct or
|
|
862 ;; is this perhaps a bug in the old parser?) I'm ignoring this for now.
|
|
863
|
|
864 (eval-when-compile
|
|
865 (w3-p-s-var-def w3-p-s-undo-list)
|
|
866 (w3-p-s-var-def w3-p-s-content)
|
|
867 (w3-p-s-var-def w3-p-s-end-tag)
|
|
868 ;; Modifies free variables:
|
|
869 ;; w3-p-d-current-element, w3-p-d-open-element-stack
|
|
870 ;; Accesses free variables:
|
|
871 ;; w3-p-d-tag-name, w3-p-d-end-tag-p
|
|
872 ;; Destroys free variables:
|
|
873 ;; w3-p-s-undo-list, w3-p-s-content, w3-p-s-end-tag
|
|
874 (defsubst w3-close-element (&optional inferred)
|
|
875 ;; inferred: non-nil if the end-tag of the current element is being
|
|
876 ;; inferred due to the presence of content not allowed in the current
|
|
877 ;; element. If t, then the tag causing this is in w3-p-d-tag-name and
|
|
878 ;; w3-p-d-end-tag-p.
|
|
879 ;; (OLD: ... otherwise it is a symbol indicating the start-tag
|
|
880 ;; of an element or *data or *space indicating data characters.)
|
|
881
|
|
882 (cond ((and inferred
|
|
883 (not (w3-element-end-tag-omissible w3-p-d-current-element)))
|
|
884 (w3-debug-html
|
|
885 (format "</%s> end-tag not omissible (required due to %s)"
|
|
886 (w3-sgml-name-to-string
|
|
887 (w3-element-name w3-p-d-current-element))
|
|
888 (cond ((eq t inferred)
|
|
889 (format (if w3-p-d-end-tag-p
|
|
890 "</%s> end-tag"
|
|
891 "start-tag for %s")
|
|
892 (w3-sgml-name-to-string
|
|
893 w3-p-d-tag-name)))
|
|
894 ;; *** Delete this functionality?
|
|
895 ((memq inferred '(*space *data))
|
|
896 "data characters")
|
|
897 ((symbolp inferred)
|
|
898 (format "start-tag for %s"
|
|
899 (w3-sgml-name-to-string inferred)))
|
|
900 )))))
|
|
901
|
|
902 ;; Undo any variable bindings of this element.
|
|
903 (cond ((w3-element-undo-list w3-p-d-current-element)
|
|
904 (setq w3-p-s-undo-list
|
|
905 (w3-element-undo-list w3-p-d-current-element))
|
|
906 (while w3-p-s-undo-list
|
|
907 (set (car (car w3-p-s-undo-list))
|
|
908 (cdr (car w3-p-s-undo-list)))
|
|
909 (setq w3-p-s-undo-list (cdr w3-p-s-undo-list)))))
|
|
910
|
|
911 (setq w3-p-s-end-tag
|
|
912 (w3-element-end-tag-name w3-p-d-current-element))
|
|
913
|
|
914 ;; Fix up the content of the current element in preparation for putting
|
|
915 ;; it in the parent.
|
|
916 ;; Remove trailing newline from content, if there is one, otherwise send
|
|
917 ;; any trailing data character item to display engine.
|
|
918 (setq w3-p-s-content (w3-element-content w3-p-d-current-element))
|
|
919 (cond ((null w3-p-s-content))
|
|
920 ((equal "\n" (car w3-p-s-content))
|
|
921 (setq w3-p-s-content (cdr w3-p-s-content)))
|
80
|
922 )
|
0
|
923
|
80
|
924 (cond ;; *** Handle LISTING the way the old parser did.
|
0
|
925 ((eq 'EMPTY (w3-element-content-model w3-p-d-current-element))
|
|
926 ;; Do nothing, can't have an end tag.
|
|
927 )
|
|
928 (t
|
|
929 ;; Normal case.
|
|
930 (if (null w3-p-s-content)
|
|
931 (w3-debug-html
|
|
932 :bad-style :outer
|
|
933 ;; Don't warn for empty TD elements or empty A elements
|
|
934 ;; with no HREF attribute.
|
|
935 ;; *** Crude hack that should really be encoded in the
|
|
936 ;; element database somehow.
|
|
937 (if (or (not (memq (w3-element-name w3-p-d-current-element)
|
|
938 '(a td)))
|
|
939 (assq 'href
|
|
940 (w3-element-attributes w3-p-d-current-element)))
|
|
941 (format "Empty %s element."
|
|
942 (w3-sgml-name-to-string
|
|
943 (w3-element-name w3-p-d-current-element))))))))
|
|
944
|
|
945 ;; Put the current element in the proper place in its parent.
|
|
946 ;; This will cause an error if we overpop the stack.
|
|
947 (w3-set-element-content
|
|
948 (car w3-p-d-open-element-stack)
|
|
949 (cons (list (w3-element-name w3-p-d-current-element)
|
|
950 (w3-element-attributes w3-p-d-current-element)
|
|
951 (nreverse w3-p-s-content))
|
|
952 (w3-element-content (car w3-p-d-open-element-stack))))
|
|
953
|
|
954 ;; Pop the stack.
|
|
955 (setq w3-p-d-current-element (car w3-p-d-open-element-stack))
|
|
956 (setq w3-p-d-open-element-stack (cdr w3-p-d-open-element-stack)))
|
|
957
|
|
958 )
|
|
959
|
|
960
|
|
961 ;;;
|
|
962 ;;; A pseudo-DTD for HTML.
|
|
963 ;;;
|
|
964
|
|
965 (eval-when-compile
|
|
966 ;; This works around the following bogus compiler complaint:
|
|
967 ;; While compiling the end of the data in file w3-parse.el:
|
|
968 ;; ** the function w3-expand-parameters is not known to be defined.
|
|
969 ;; This is a bogus error. Anything of this form will trigger this message:
|
|
970 ;; (eval-when-compile (defun xyzzy () (xyzzy)))
|
|
971 (defun w3-expand-parameters (pars data) nil))
|
|
972
|
|
973 (eval-when-compile
|
|
974 (defun w3-expand-parameters (pars data)
|
|
975 (cond ((null data)
|
|
976 nil)
|
|
977 ((consp data)
|
|
978 ;; This has to be written carefully to avoid exceeding the
|
|
979 ;; maximum lisp function call nesting depth.
|
|
980 (let (result)
|
|
981 (while (consp data)
|
|
982 (let ((car-exp (w3-expand-parameters pars (car data))))
|
|
983 (setq result
|
|
984 (if (and (symbolp (car data))
|
|
985 (not (eq car-exp (car data)))
|
|
986 ;; An expansion occurred.
|
|
987 (listp car-exp))
|
|
988 ;; The expansion was a list, which we splice in.
|
|
989 (condition-case err
|
|
990 (append (reverse car-exp) result)
|
|
991 (wrong-type-argument
|
|
992 (if (eq 'listp (nth 1 err))
|
|
993 ;; Wasn't really a "list" since the last
|
|
994 ;; cdr wasn't nil, so don't try to splice
|
|
995 ;; it in.
|
|
996 (cons car-exp result)
|
|
997 (signal (car err) (cdr err)))))
|
|
998 (cons car-exp result))))
|
|
999 (setq data (cdr data)))
|
|
1000 (append (nreverse result)
|
|
1001 (w3-expand-parameters pars data))))
|
|
1002 ((symbolp data)
|
|
1003 (let ((sym-exp (cdr-safe (assq data pars))))
|
|
1004 (if sym-exp
|
|
1005 (w3-expand-parameters pars sym-exp)
|
|
1006 data)))
|
|
1007 ((vectorp data)
|
|
1008 (let ((i 0)
|
|
1009 (result (copy-sequence data)))
|
|
1010 (while (< i (length data))
|
|
1011 (aset result i
|
|
1012 (w3-expand-parameters pars (aref data i)))
|
|
1013 (setq i (1+ i)))
|
|
1014 result))
|
|
1015 (t
|
|
1016 data))))
|
|
1017
|
|
1018 (eval-when-compile
|
|
1019 (defun w3-unfold-dtd (items)
|
|
1020 (let (result)
|
|
1021 (while items
|
|
1022 (let* ((item (car items))
|
|
1023 (names (car item))
|
|
1024 (content-model
|
|
1025 (or (cdr-safe (assq 'content-model item))
|
|
1026 (error "impossible")))
|
|
1027 (overrides (cdr-safe (assq 'overrides item)))
|
|
1028 (end-tag-omissible
|
|
1029 (or (cdr-safe (assq 'end-tag-omissible item))
|
|
1030 ;; *** Is this SGML standard?
|
|
1031 (eq 'EMPTY content-model)))
|
|
1032 (deprecated (cdr-safe (assq 'deprecated item)))
|
|
1033 element
|
|
1034 name)
|
|
1035 (while names
|
|
1036 (setq name (car names))
|
|
1037 (setq names (cdr names))
|
|
1038
|
|
1039 ;; Create and initialize the element information data
|
|
1040 ;; structure.
|
|
1041 (setq element (w3-make-element))
|
|
1042 (w3-set-element-name element name)
|
|
1043 (w3-set-element-end-tag-name
|
|
1044 element
|
|
1045 (intern (concat "/" (symbol-name name))))
|
|
1046 (w3-set-element-state element 0)
|
|
1047 (w3-set-element-content-model element content-model)
|
|
1048 (w3-set-element-end-tag-omissible element end-tag-omissible)
|
|
1049
|
|
1050 (or (memq deprecated '(nil t obsolete))
|
|
1051 (error "impossible"))
|
|
1052 (w3-set-element-deprecated element deprecated)
|
|
1053
|
|
1054 ;; Inclusions and exclusions are specified differently in the
|
|
1055 ;; human-coded DTD than in the format the implementation uses.
|
|
1056 ;; The human-coded version is designed to be easy to edit and to
|
|
1057 ;; work with w3-expand-parameters while the internal version is
|
|
1058 ;; designed to be fast. We have to translate here. This work
|
|
1059 ;; is repeated for every element listed in `names' so that the
|
|
1060 ;; exclusion exception error messages can be accurate.
|
|
1061 (let ((inclusions (cdr-safe (assq 'inclusions item)))
|
|
1062 (exclusions (cdr-safe (assq 'exclusions item)))
|
|
1063 (exclusion-mode '*close)
|
|
1064 (exclusion-message
|
|
1065 (format "%s exclusion" (w3-sgml-name-to-string name)))
|
|
1066 exceptions)
|
|
1067 (while inclusions
|
|
1068 (setq exceptions (cons (cons (car inclusions)
|
|
1069 '(*include *same nil))
|
|
1070 exceptions))
|
|
1071 (setq inclusions (cdr inclusions)))
|
|
1072 (while exclusions
|
|
1073 (cond ((memq (car exclusions) '(*discard *include *close))
|
|
1074 (setq exclusion-mode (car exclusions)))
|
|
1075 ((stringp (car exclusions))
|
|
1076 (setq exclusion-message (car exclusions)))
|
|
1077 (t
|
|
1078 (setq exceptions (cons (list (car exclusions)
|
|
1079 exclusion-mode
|
|
1080 '*same
|
|
1081 exclusion-message)
|
|
1082 exceptions))))
|
|
1083 (setq exclusions (cdr exclusions)))
|
|
1084 (let ((overrides (if exceptions
|
|
1085 (cons (cons 'w3-p-d-exceptions
|
|
1086 (cons nil exceptions))
|
|
1087 overrides)
|
|
1088 overrides)))
|
|
1089 (w3-set-element-overrides element overrides)))
|
|
1090
|
|
1091 (setq result (cons (cons name element) result))))
|
|
1092 (setq items (cdr items)))
|
|
1093 result)))
|
|
1094
|
|
1095 ;; Load the HTML DTD.
|
|
1096 ;; <URL:ftp://ds.internic.net/rfc/rfc1866.txt>
|
|
1097 ;; *** Be sure to incorporate rfc1867 when attribute-checking is added.
|
|
1098 ;; *** Write function to check sanity of the content-model forms.
|
|
1099 ;; *** I18N: Add Q, BDO, SPAN
|
|
1100 (mapcar
|
|
1101 (function
|
|
1102 (lambda (pair)
|
|
1103 (put (car pair) 'html-element-info (cdr pair))))
|
|
1104 ;; The purpose of this complexity is to speed up loading by
|
|
1105 ;; pre-evaluating as much as possible at compile time.
|
|
1106 (eval-when-compile
|
|
1107 (w3-unfold-dtd
|
|
1108 (w3-expand-parameters
|
|
1109 '(
|
|
1110 (%headempty . (link base meta range))
|
2
|
1111 (%headmisc . (script))
|
0
|
1112 (%head-deprecated . (nextid))
|
|
1113
|
|
1114 ;; client-side imagemaps
|
|
1115 (%imagemaps . (area map))
|
82
|
1116 (%input.fields . (input select textarea keygen label))
|
0
|
1117 ;; special action is taken for %text inside %body.content in the
|
|
1118 ;; content model of each element.
|
80
|
1119 (%body.content . (%heading %block style hr div address %imagemaps))
|
0
|
1120
|
|
1121 (%heading . (h1 h2 h3 h4 h5 h6))
|
|
1122
|
|
1123 ;; Emacs-w3 extensions
|
|
1124 (%emacsw3-crud . (pinhead flame cookie yogsothoth hype peek))
|
|
1125
|
80
|
1126 (%block . (p %list dl form %preformatted
|
0
|
1127 %blockquote isindex fn table fig note
|
82
|
1128 multicol center %block-deprecated %block-obsoleted))
|
0
|
1129 (%list . (ul ol))
|
|
1130 (%preformatted . (pre))
|
|
1131 (%blockquote . (bq))
|
|
1132 (%block-deprecated . (dir menu blockquote))
|
|
1133 (%block-obsoleted . (xmp listing))
|
|
1134
|
|
1135 ;; Why is IMG in this list?
|
|
1136 (%pre.exclusion . (*include img *discard tab math big small sub sup))
|
|
1137
|
82
|
1138 (%text . (*data b %notmath sub sup %emacsw3-crud %input.fields))
|
0
|
1139 (%notmath . (%special %font %phrase %misc))
|
80
|
1140 (%font . (i u s strike tt big small sub sup font
|
2
|
1141 roach secret wired)) ;; B left out for MATH
|
|
1142 (%phrase . (em strong dfn code samp kbd var cite blink))
|
80
|
1143 (%special . (a img applet object font basefont br script style map math tab span bdo))
|
2
|
1144 (%misc . (q lang au person acronym abbrev ins del))
|
0
|
1145
|
|
1146 (%formula . (*data %math))
|
|
1147 (%math . (box above below %mathvec root sqrt array sub sup
|
|
1148 %mathface))
|
|
1149 (%mathvec . (vec bar dot ddot hat tilde))
|
|
1150 (%mathface . (b t bt))
|
|
1151
|
|
1152 (%mathdelims . (over atop choose left right of))
|
|
1153
|
|
1154 ;; What the hell? This takes BODYTEXT????? No way!
|
|
1155 (%bq-content-model . [(nil
|
|
1156 nil
|
|
1157 (((bodytext) *include *next))
|
|
1158 (bodytext *next))
|
|
1159 (nil
|
|
1160 nil
|
|
1161 (((credit) *include *next))
|
|
1162 nil)
|
|
1163 (nil nil nil nil)
|
|
1164 ])
|
|
1165
|
|
1166 ;; non-default bad HTML handling.
|
|
1167 (%in-text-ignore . ((p %heading) *discard *same error))
|
|
1168 )
|
|
1169 '(
|
|
1170 ;; A dummy element that will contain *document.
|
|
1171 ((*holder)
|
|
1172 (content-model . [(nil nil nil nil)]))
|
|
1173 ;; The root of the parse tree. We start with a pseudo-element
|
|
1174 ;; named *document for convenience.
|
|
1175 ((*document)
|
|
1176 (content-model . [(nil nil (((html) *include *next)) (html *next))
|
|
1177 (nil
|
|
1178 nil
|
|
1179 nil
|
|
1180 (*include *same "after document end"))])
|
|
1181 (end-tag-omissible . t))
|
|
1182 ;; HTML O O (HEAD, BODY)
|
|
1183 ((html)
|
|
1184 (content-model . [(nil
|
|
1185 nil
|
|
1186 (((head) *include *next))
|
|
1187 (head *next))
|
|
1188 (nil
|
|
1189 nil
|
|
1190 (((body) *include *next)
|
|
1191 ;; Netscape stuff
|
|
1192 ((frameset) *include 4)
|
|
1193 )
|
|
1194 (body *next))
|
|
1195 (nil
|
|
1196 nil
|
|
1197 (((plaintext) *include *next))
|
|
1198 (*retry *next))
|
|
1199 (nil
|
|
1200 nil
|
|
1201 nil
|
|
1202 (*include *same "after BODY"))
|
|
1203 (nil
|
|
1204 nil
|
|
1205 nil
|
|
1206 (*include *same "after FRAMESET"))
|
|
1207 ])
|
|
1208 (end-tag-omissible . t))
|
|
1209 ((head)
|
2
|
1210 (content-model . [((title isindex %headempty %headmisc
|
|
1211 style %head-deprecated)
|
0
|
1212 nil
|
|
1213 nil
|
|
1214 ;; *** Should only close if tag can
|
|
1215 ;; legitimately follow head. So many can that
|
|
1216 ;; I haven't bothered to enumerate them.
|
|
1217 (*close))])
|
|
1218 (end-tag-omissible . t))
|
2
|
1219 ;; SCRIPT - - (#PCDATA)
|
|
1220 ((script)
|
80
|
1221 (content-model . XCDATA ; not official, but allows
|
|
1222 ; comment hiding of script, and also
|
|
1223 ; idiots that use '</' in scripts.
|
2
|
1224 ))
|
0
|
1225 ;; TITLE - - (#PCDATA)
|
|
1226 ((title)
|
|
1227 (content-model . RCDATA ; not official
|
|
1228 ;; [((*data) include-space nil nil)]
|
|
1229 ))
|
|
1230 ;; STYLE - O (#PCDATA)
|
|
1231 ;; STYLE needs to be #PCDATA to allow omitted end tag. Bleagh.
|
|
1232 ((style)
|
2
|
1233 (content-model . CDATA)
|
0
|
1234 (end-tag-omissible . t))
|
|
1235 ((body)
|
|
1236 (content-model . [((banner) nil nil (*retry *next))
|
|
1237 ((bodytext) nil nil (bodytext *next))
|
|
1238 (nil nil (((plaintext) *close)) nil)])
|
|
1239 (inclusions . (spot))
|
|
1240 (end-tag-omissible . t))
|
|
1241 ;; Do I really want to include BODYTEXT? It has something to do
|
|
1242 ;; with mixed content screwing things up, and I don't understand
|
|
1243 ;; it. Wait! It's used by BQ!
|
|
1244 ((bodytext)
|
|
1245 (content-model . [((%body.content)
|
|
1246 nil
|
|
1247 ;; Push <P> before data characters. Non-SGML.
|
|
1248 (((%text) p)
|
|
1249 ;; Closing when seeing CREDIT is a stupidity
|
|
1250 ;; caused by BQ's sharing of BODYTEXT. BQ
|
|
1251 ;; should have its own BQTEXT.
|
|
1252 ((credit plaintext) *close))
|
|
1253 nil)])
|
|
1254 (end-tag-omissible . t))
|
82
|
1255 ((div banner center multicol)
|
0
|
1256 (content-model . [((%body.content)
|
|
1257 nil
|
|
1258 ;; Push <P> before data characters. Non-SGML.
|
|
1259 (((%text) p))
|
|
1260 nil)]))
|
|
1261 ((address)
|
|
1262 (content-model . [((p)
|
|
1263 nil
|
|
1264 ;; Push <P> before data characters. Non-SGML.
|
|
1265 (((%text) p))
|
|
1266 nil)]))
|
|
1267 ((%heading)
|
|
1268 (content-model . [((%text)
|
|
1269 include-space
|
|
1270 ((%in-text-ignore))
|
|
1271 nil)]))
|
80
|
1272 ((span bdo)
|
|
1273 (content-model . [((%text)
|
|
1274 include-space
|
|
1275 nil
|
|
1276 nil)])
|
|
1277 )
|
0
|
1278 ((p)
|
|
1279 (content-model . [((%text)
|
|
1280 include-space
|
|
1281 nil
|
|
1282 ;; *** Should only close if tag can
|
|
1283 ;; legitimately follow P. So many can that I
|
|
1284 ;; don't bother to enumerate here.
|
|
1285 (*close))])
|
|
1286 (end-tag-omissible . t))
|
|
1287 ((ul ol)
|
|
1288 (content-model . [((lh)
|
|
1289 nil
|
|
1290 (((li) *include *next))
|
|
1291 (*retry *next))
|
|
1292 ((li)
|
|
1293 nil
|
|
1294 ;; Push <LI> before data characters or block
|
|
1295 ;; elements.
|
|
1296 ;; Non-SGML.
|
|
1297 (((%text %block) li *same error))
|
|
1298 nil)]))
|
|
1299 ((lh)
|
|
1300 (content-model . [((%text)
|
|
1301 include-space
|
|
1302 (((dd dt li) *close)
|
|
1303 (%in-text-ignore))
|
|
1304 nil)])
|
|
1305 (end-tag-omissible . t))
|
|
1306 ((dir menu)
|
|
1307 (content-model . [((li)
|
|
1308 nil
|
|
1309 (((%text) li *same error))
|
|
1310 nil)])
|
|
1311 (exclusions . (%block))
|
|
1312 (deprecated . t))
|
|
1313 ((li)
|
|
1314 (content-model . [((%block)
|
|
1315 nil
|
|
1316 (((li) *close)
|
|
1317 ;; Push <P> before data characters. Non-SGML.
|
|
1318 ((%text) p))
|
|
1319 nil)])
|
|
1320 (end-tag-omissible . t)
|
|
1321 ;; Better bad HTML handling.
|
|
1322 ;; Technically, there are a few valid documents that this will
|
|
1323 ;; hose, because you can have H1 inside FORM inside LI. However,
|
|
1324 ;; I don't think that should be allowed anyway.
|
|
1325 (exclusions . (*discard "not allowed here" %heading)))
|
|
1326 ((dl)
|
|
1327 (content-model . [((lh)
|
|
1328 nil
|
|
1329 (((dt dd) *include *next))
|
|
1330 (*retry *next))
|
|
1331 ((dt dd)
|
|
1332 nil
|
|
1333 ;; Push <DD> before data characters or block
|
|
1334 ;; items.
|
|
1335 ;; Non-SGML.
|
|
1336 (((%text %block) dd *same error))
|
|
1337 nil)]))
|
|
1338 ((dt)
|
|
1339 (content-model . [((%text)
|
|
1340 include-space
|
|
1341 (((dd dt) *close)
|
|
1342 (%in-text-ignore))
|
|
1343 nil)])
|
|
1344 (end-tag-omissible . t))
|
|
1345 ;; DD is just like LI, but we treat it separately because it can be
|
|
1346 ;; followed by a different set of elements.
|
|
1347 ((dd)
|
|
1348 (content-model . [((%block)
|
|
1349 nil
|
|
1350 (((dt dd) *close)
|
|
1351 ;; Push <P> before data characters. Non-SGML.
|
|
1352 ((%text) p))
|
|
1353 nil)])
|
|
1354 (end-tag-omissible . t)
|
|
1355 ;; See comment with LI.
|
|
1356 (exclusions . (*discard "not allowed here" %heading)))
|
|
1357 ((pre)
|
|
1358 (content-model . [((%text hr)
|
|
1359 include-space
|
|
1360 ((%in-text-ignore))
|
|
1361 nil)])
|
|
1362 (exclusions . (%pre.exclusion)))
|
|
1363 ;; BLOCKQUOTE deprecated, BQ okay
|
|
1364 ((bq)
|
|
1365 (content-model . %bq-content-model))
|
|
1366 ((blockquote)
|
|
1367 (content-model . %bq-content-model)
|
|
1368 ;; BLOCKQUOTE is deprecated in favor of BQ in the HTML 3.0 DTD.
|
|
1369 ;; However, BQ is not even mentioned in the HTML 2.0 DTD. So I
|
|
1370 ;; don't think we can enable this yet:
|
|
1371 ;;(deprecated . t)
|
|
1372 )
|
|
1373 ((fn note)
|
|
1374 (content-model . [((%body.content)
|
|
1375 nil
|
|
1376 ;; Push <P> before data characters. Non-SGML.
|
|
1377 (((%text) p))
|
|
1378 nil)]))
|
|
1379 ((fig)
|
|
1380 (content-model . [((overlay) nil nil (*retry *next))
|
|
1381 (nil
|
|
1382 nil
|
|
1383 (((caption) *include *next))
|
|
1384 (*retry *next))
|
|
1385 (nil
|
|
1386 nil
|
|
1387 (((figtext) *include *next)
|
|
1388 ((credit) *retry *next))
|
|
1389 ;; *** Should only do this for elements that
|
|
1390 ;; can be in FIGTEXT.
|
|
1391 (figtext *next))
|
|
1392 (nil nil (((credit) *include *next)) nil)
|
|
1393 (nil nil nil nil)]))
|
|
1394 ((caption credit)
|
|
1395 (content-model . [((%text)
|
|
1396 nil
|
|
1397 ((%in-text-ignore))
|
|
1398 nil)]))
|
|
1399 ((figtext)
|
|
1400 (content-model . [((%body.content)
|
|
1401 nil
|
|
1402 ;; Push <P> before data characters. Very non-SGML.
|
|
1403 (((%text) p)
|
|
1404 ((credit) *close))
|
|
1405 nil)])
|
|
1406 (end-tag-omissible . t))
|
80
|
1407 ((%emacsw3-crud basefont)
|
0
|
1408 (content-model . EMPTY))
|
|
1409 ;; FORM - - %body.content -(FORM) +(INPUT|KEYGEN|SELECT|TEXTAREA)
|
|
1410 ((form)
|
|
1411 ;; Same as BODY. Ugh!
|
|
1412 (content-model . [((%body.content)
|
|
1413 nil
|
|
1414 ;; Push <P> before data characters. Non-SGML.
|
|
1415 (((%text) p))
|
|
1416 nil)])
|
|
1417 (exclusions . (form))
|
|
1418 (inclusions . (input select textarea keygen label)))
|
|
1419 ;; *** Where is the URL describing this?
|
|
1420 ((label)
|
|
1421 (content-model . [((%text)
|
|
1422 include-space
|
|
1423 nil
|
|
1424 nil)])
|
|
1425 ;; *** These are already included, no need to repeat.
|
|
1426 ;;(inclusions . (input select textarea))
|
|
1427 ;; *** Is a LABEL allowed inside a LABEL? I assume no.
|
|
1428 (exclusions . (label))
|
|
1429 ;; The next line just does the default so is unneeded:
|
|
1430 ;;(end-tag-omissible . nil)
|
|
1431 )
|
|
1432 ;; SELECT - - (OPTION+) -(INPUT|KEYGEN|TEXTAREA|SELECT)>
|
|
1433 ;; *** This should be -(everything).
|
|
1434 ((select)
|
|
1435 (content-model . [((option) nil nil nil)])
|
|
1436 (exclusions . (input label select keygen textarea)))
|
|
1437 ;; option - O (#PCDATA)
|
|
1438 ;; needs to be #PCDATA to allow omitted end tag.
|
|
1439 ((option)
|
|
1440 ;; I'd like to make this RCDATA to avoid problems with inclusions
|
|
1441 ;; like SPOT, but that would conflict with the omitted end-tag, I
|
|
1442 ;; think.
|
|
1443 (content-model . [((*data)
|
|
1444 include-space
|
|
1445 (((option) *close))
|
|
1446 nil)])
|
|
1447 (end-tag-omissible . t))
|
|
1448 ;; TEXTAREA - - (#PCDATA) -(INPUT|TEXTAREA|KEYGEN|SELECT)
|
|
1449 ((textarea)
|
|
1450 ;; Same comment as for OPTION about RCDATA.
|
|
1451 (content-model . [((*data) include-space nil nil)])
|
|
1452 (exclusions . (input select label keygen textarea)))
|
|
1453 ((hr br img isindex input keygen overlay wbr spot tab
|
|
1454 %headempty %mathdelims)
|
|
1455 (content-model . EMPTY))
|
|
1456 ((nextid)
|
|
1457 (content-model . EMPTY)
|
|
1458 (deprecated . t))
|
|
1459 ((a)
|
|
1460 (content-model . [((%text)
|
|
1461 include-space
|
|
1462 (((%heading)
|
|
1463 *include *same "deprecated inside A")
|
|
1464 ;; *** I haven't made up my mind whether this
|
|
1465 ;; is a good idea. It can result in a lot of
|
|
1466 ;; bad formatting if the A is *never* closed.
|
|
1467 ;;((p) *discard *same error)
|
|
1468 )
|
|
1469 nil)])
|
|
1470 (exclusions . (a)))
|
2
|
1471 ((b font %font %phrase %misc nobr)
|
0
|
1472 (content-model . [((%text)
|
|
1473 include-space
|
|
1474 ((%in-text-ignore))
|
|
1475 nil)]))
|
|
1476 ((plaintext)
|
|
1477 (content-model . XXCDATA)
|
|
1478 (end-tag-omissible . t)
|
|
1479 (deprecated . obsolete))
|
|
1480 ((xmp listing)
|
|
1481 (content-model . XCDATA)
|
|
1482 (deprecated . obsolete))
|
|
1483 ;; Latest table spec (as of Nov. 13 1995) is at:
|
|
1484 ;; <URL:ftp://ds.internic.net/internet-drafts/draft-ietf-html-tables-03.txt>
|
|
1485 ((table)
|
|
1486 (content-model . [(nil
|
|
1487 nil
|
|
1488 (((caption) *include *next)
|
108
|
1489 ((%text) tr *same error)
|
0
|
1490 ((col colgroup thead tfoot tbody tr) *retry *next))
|
|
1491 (*retry *next)) ;error handling
|
|
1492 ((col colgroup)
|
|
1493 nil
|
|
1494 (((thead tfoot tbody tr) *retry *next))
|
|
1495 (*retry *next)) ;error handling
|
|
1496 (nil
|
|
1497 nil
|
|
1498 (((thead) *include *next)
|
|
1499 ((tfoot tbody tr) *retry *next))
|
|
1500 (*retry *next)) ;error handling
|
|
1501 (nil
|
|
1502 nil
|
|
1503 (((tfoot) *include *next)
|
|
1504 ((tbody tr) *retry *next))
|
|
1505 (*retry *next)) ;error handling
|
|
1506 ((tbody)
|
|
1507 nil
|
|
1508 (((tr) tbody *same)
|
|
1509 ;; error handling
|
|
1510 ((%body.content) tbody *same error))
|
|
1511 nil)]))
|
|
1512 ((colgroup)
|
|
1513 (content-model . [((col)
|
|
1514 nil
|
|
1515 (((colgroup thead tfoot tbody tr) *close))
|
|
1516 nil)])
|
|
1517 (end-tag-omissible . t))
|
|
1518 ((col)
|
|
1519 (content-model . EMPTY))
|
|
1520 ((thead)
|
|
1521 (content-model . [((tr)
|
|
1522 nil
|
|
1523 (((tfoot tbody) *close)
|
|
1524 ;; error handling
|
|
1525 ((%body.content) tr *same error))
|
|
1526 nil)])
|
|
1527 (end-tag-omissible . t))
|
|
1528 ((tfoot tbody)
|
|
1529 (content-model . [((tr)
|
|
1530 nil
|
|
1531 (((tbody) *close)
|
|
1532 ;; error handling
|
|
1533 ((%body.content) tr *same error))
|
|
1534 nil)])
|
|
1535 (end-tag-omissible . t))
|
|
1536 ((tr)
|
|
1537 (content-model . [((td th)
|
|
1538 nil
|
|
1539 (((tr tfoot tbody) *close)
|
|
1540 ;; error handling
|
108
|
1541 ((%body.content %text) td *same error))
|
0
|
1542 nil)])
|
|
1543 (end-tag-omissible . t))
|
|
1544 ((td th)
|
|
1545 ;; Arrgh! Another %body.content!!! Stupid!!!
|
|
1546 (content-model . [((%body.content)
|
|
1547 nil
|
|
1548 (((td th tr tfoot tbody) *close)
|
|
1549 ;; Push <P> before data characters. Non-SGML.
|
|
1550 ((%text) p))
|
|
1551 nil)])
|
|
1552 (end-tag-omissible . t))
|
|
1553 ((math)
|
|
1554 (content-model . [((*data) include-space nil nil)])
|
|
1555 (overrides .
|
|
1556 ((w3-p-d-shortref-chars t . "\{_^")
|
|
1557 (w3-p-d-shortrefs t . (("\\^" . "<sup>")
|
|
1558 ("_" . "<sub>")
|
|
1559 ("{" . "<box>")))))
|
|
1560 (inclusions . (%math))
|
|
1561 (exclusions . (%notmath)))
|
|
1562 ((sup)
|
|
1563 (content-model . [((%text)
|
|
1564 include-space
|
|
1565 ((%in-text-ignore))
|
|
1566 nil)])
|
|
1567 (overrides .
|
|
1568 ((w3-p-d-shortref-chars t . "\{_^")
|
|
1569 (w3-p-d-shortrefs t . (("\\^" . "</sup>")
|
|
1570 ("_" . "<sub>")
|
|
1571 ("{" . "<box>"))))))
|
|
1572 ((sub)
|
|
1573 (content-model . [((%text)
|
|
1574 include-space
|
|
1575 ((%in-text-ignore))
|
|
1576 nil)])
|
|
1577 (overrides .
|
|
1578 ((w3-p-d-shortref-chars t . "\{_^")
|
|
1579 (w3-p-d-shortrefs t . (("\\^" . "<sup>")
|
|
1580 ("_" . "</sub>")
|
|
1581 ("{" . "<box>"))))))
|
|
1582 ((box)
|
|
1583 (content-model . [((%formula)
|
|
1584 include-space
|
|
1585 (((left) *include 1)
|
|
1586 ((over atop choose) *include 2)
|
|
1587 ((right) *include 3))
|
|
1588 nil)
|
|
1589 ((%formula)
|
|
1590 include-space
|
|
1591 (((over atop choose) *include 2)
|
|
1592 ((right) *include 3))
|
|
1593 nil)
|
|
1594 ((%formula)
|
|
1595 include-space
|
|
1596 (((right) *include 3))
|
|
1597 nil)
|
|
1598 ((%formula) include-space nil nil)])
|
|
1599 (overrides .
|
|
1600 ((w3-p-d-shortref-chars t . "{}_^")
|
|
1601 (w3-p-d-shortrefs t . (("\\^" . "<sup>")
|
|
1602 ("_" . "<sub>")
|
|
1603 ("{" . "<box>")
|
|
1604 ("}" . "</box>"))))))
|
|
1605 ((above below %mathvec t bt sqrt)
|
|
1606 (content-model . [((%formula) include-space nil nil)]))
|
|
1607 ;; ROOT has a badly-specified content-model in HTML 3.0.
|
|
1608 ((root)
|
|
1609 (content-model . [((%formula)
|
|
1610 include-space
|
|
1611 (((of) *include *next))
|
|
1612 nil)
|
|
1613 ((%formula) include-space nil nil)]))
|
|
1614 ((of)
|
|
1615 (content-model . [((%formula) include-space nil nil)])
|
|
1616 ;; There is no valid way to infer a missing end-tag for OF. This
|
|
1617 ;; is bizarre.
|
|
1618 (end-tag-omissible . t))
|
|
1619 ((array)
|
|
1620 (content-model . [((row) nil nil nil)]))
|
|
1621 ((row)
|
|
1622 (content-model . [((item) nil (((row) *close)) nil)])
|
|
1623 (end-tag-omissible . t))
|
|
1624 ((item)
|
|
1625 (content-model . [((%formula)
|
|
1626 include-space
|
|
1627 (((row item) *close))
|
|
1628 nil)])
|
|
1629 (end-tag-omissible . t))
|
|
1630 ;; The old parser would look for the </EMBED> end-tag and include
|
|
1631 ;; the contents between <EMBED> and </EMBED> as the DATA attribute
|
|
1632 ;; of the EMBED start-tag. However, it did not require the
|
|
1633 ;; </EMBED> end-tag and did nothing if it was missing. This is
|
|
1634 ;; completely impossible to specify in SGML.
|
|
1635 ;;
|
|
1636 ;; See
|
|
1637 ;; <URL:http://www.eit.com/goodies/lists/www.lists/www-html.1995q3/0603.html>
|
|
1638 ;;
|
|
1639 ;; Questions: Does EMBED require the end-tag? How does NOEMBED fit
|
|
1640 ;; into this? Where can EMBED appear?
|
|
1641 ;;
|
|
1642 ;; Nov. 25 1995: a new spec for EMBED (also an I-D):
|
|
1643 ;; <URL:http://www.cs.princeton.edu/~burchard/www/interactive/>
|
|
1644 ;;
|
|
1645 ;; Here is my guess how to code EMBED:
|
|
1646 ((embed)
|
|
1647 (content-model . [((noembed) nil nil (*close))]))
|
|
1648 ((noembed)
|
|
1649 (content-model . [((%body.content) ; hack hack hack
|
|
1650 nil
|
|
1651 (((%text) p))
|
|
1652 nil)]))
|
|
1653 ;;
|
|
1654 ;; FRAMESET is a Netscape thing.
|
|
1655 ;; <URL:http://www.eit.com/goodies/lists/www.lists/www-html.1995q3/0588.html>
|
|
1656 ((frameset)
|
|
1657 (content-model . [((noframes frame frameset) nil nil nil)]))
|
|
1658 ((noframes)
|
|
1659 (content-model . [((%body.content)
|
|
1660 nil
|
|
1661 ;; Push <P> before data characters. Non-SGML.
|
|
1662 (((%text) p))
|
|
1663 nil)]))
|
|
1664 ((frame)
|
|
1665 (content-model . EMPTY))
|
|
1666 ;;
|
|
1667 ;; APPLET is a Java thing.
|
80
|
1668 ;; OBJECT is a cougar thing
|
0
|
1669 ;; <URL:http://java.sun.com/JDK-beta/filesinkit/README>
|
80
|
1670 ((applet object)
|
0
|
1671 ;; I really don't want to add another ANY content-model.
|
|
1672 (content-model . XINHERIT)
|
|
1673 (inclusions . (param)))
|
|
1674 ((param)
|
|
1675 (content-model . EMPTY))
|
|
1676 ;; backward compatibility with old Java.
|
|
1677 ((app)
|
|
1678 (content-model . EMPTY))
|
|
1679 ;; Client-side image maps.
|
|
1680 ;; <URL:ftp://ds.internic.net/internet-drafts/draft-seidman-clientsideimagemap-01.txt>
|
|
1681 ;; *** The only problem is that I don't know in what elements MAP
|
|
1682 ;; can appear, so none of this is reachable yet.
|
|
1683 ((map)
|
|
1684 (content-model . [((area) nil nil nil)]))
|
|
1685 ((area)
|
|
1686 (content-model . EMPTY))
|
|
1687 )))))
|
|
1688
|
|
1689
|
|
1690 ;;;
|
|
1691 ;;; Omitted tag inference using state transition tables.
|
|
1692 ;;;
|
|
1693
|
|
1694 (eval-when-compile
|
|
1695
|
|
1696 (w3-p-s-var-def w3-p-s-includep)
|
|
1697 (w3-p-s-var-def w3-p-s-state-transitions)
|
|
1698 (w3-p-s-var-def w3-p-s-transition)
|
|
1699 (w3-p-s-var-def w3-p-s-tran-list)
|
|
1700 (w3-p-s-var-def w3-p-s-content-model)
|
|
1701 (w3-p-s-var-def w3-p-s-except)
|
80
|
1702 (w3-p-s-var-def w3-p-s-baseobject)
|
|
1703 (w3-p-s-var-def w3-p-s-btdt)
|
0
|
1704 ;; Uses free variables:
|
|
1705 ;; w3-p-d-current-element, w3-p-d-exceptions
|
|
1706 ;; Destroys free variables:
|
|
1707 ;; w3-p-s-includep, w3-p-s-state-transitions, w3-p-s-transition,
|
|
1708 ;; w3-p-s-tran-list, w3-p-s-content-model, w3-p-s-except
|
|
1709 ;; Returns t if the element or data characters should be included.
|
|
1710 ;; Returns nil if the element or data characters should be discarded.
|
|
1711 (defsubst w3-grok-tag-or-data (tag-name)
|
|
1712 (while
|
|
1713 (cond
|
|
1714 ((symbolp (setq w3-p-s-content-model
|
|
1715 (w3-element-content-model w3-p-d-current-element)))
|
|
1716 (or (and (memq w3-p-s-content-model
|
|
1717 '(CDATA RCDATA XCDATA XXCDATA))
|
|
1718 (memq tag-name '(*data *space)))
|
|
1719 ;; *** Implement ANY.
|
80
|
1720 (error "impossible content model lossage"))
|
0
|
1721 (setq w3-p-s-includep t)
|
|
1722 ;; Exit loop.
|
|
1723 nil)
|
|
1724 (t
|
|
1725 ;; We have a complex content model.
|
|
1726 ;; Cache some data from the element info structure. Format is:
|
|
1727 ;; (INCLUDES INCSPACEP (((TAG ...) . TRANSITION) ...) DEFAULT)
|
|
1728 (setq w3-p-s-state-transitions
|
|
1729 (aref w3-p-s-content-model
|
|
1730 (w3-element-state w3-p-d-current-element)))
|
|
1731
|
|
1732 ;; Optimize the common cases.
|
|
1733 (cond
|
|
1734 ((eq '*space tag-name)
|
|
1735 ;; Optimizing the (*space *discard *same nil) transition.
|
|
1736 (setq w3-p-s-includep (car (cdr w3-p-s-state-transitions)))
|
|
1737 ;; Don't loop.
|
|
1738 nil)
|
|
1739 ((and (not (setq w3-p-s-except
|
|
1740 (assq tag-name w3-p-d-exceptions)))
|
|
1741 (memq tag-name (car w3-p-s-state-transitions)))
|
|
1742 ;; Equivalent to a transition of (TAG *include *same nil).
|
|
1743 ;; So we are done, return t to caller.
|
|
1744 (setq w3-p-s-includep t)
|
|
1745 ;; Exit loop.
|
|
1746 nil)
|
|
1747 (t
|
|
1748 ;; The general case.
|
|
1749 (cond
|
|
1750 ;; Handle inclusions and exclusions.
|
|
1751 (w3-p-s-except
|
|
1752 (setq w3-p-s-transition (cdr w3-p-s-except)))
|
|
1753 ;; See if the transition is in the complex transitions
|
|
1754 ;; component.
|
|
1755 ((progn
|
|
1756 (setq w3-p-s-tran-list
|
|
1757 (car (cdr (cdr w3-p-s-state-transitions))))
|
|
1758 (setq w3-p-s-transition nil)
|
|
1759 (while w3-p-s-tran-list
|
|
1760 (cond ((memq tag-name (car (car w3-p-s-tran-list)))
|
|
1761 ;; We've found a transition.
|
|
1762 (setq w3-p-s-transition
|
|
1763 (cdr (car w3-p-s-tran-list)))
|
|
1764 (setq w3-p-s-tran-list nil))
|
|
1765 (t
|
|
1766 (setq w3-p-s-tran-list (cdr w3-p-s-tran-list)))))
|
|
1767 ;; Check if we found it.
|
|
1768 w3-p-s-transition)
|
|
1769 ;; body of cond clause empty
|
|
1770 )
|
|
1771 ;; Try finding the transition in the DEFAULT component of the
|
|
1772 ;; transition table, but avoid doing this for unknown elements,
|
|
1773 ;; always use the default-default for them.
|
|
1774 ((and (or (eq '*data tag-name)
|
|
1775 (w3-known-element-p tag-name))
|
|
1776 (setq w3-p-s-transition
|
|
1777 (nth 3 w3-p-s-state-transitions)))
|
|
1778 ;; body of cond clause empty
|
|
1779 )
|
|
1780 (t
|
|
1781 ;; Supply a default-default transition.
|
|
1782 (if (not (or (eq '*data tag-name)
|
|
1783 (w3-known-element-p tag-name)))
|
|
1784 (setq w3-p-s-transition
|
|
1785 '(*discard *same "unknown element"))
|
|
1786
|
|
1787 ;; Decide whether to *close or *discard
|
|
1788 ;; based on whether this element would be
|
|
1789 ;; accepted as valid in an open ancestor.
|
|
1790 (let ((open-list w3-p-d-open-element-stack)
|
|
1791 (all-end-tags-omissible
|
|
1792 (w3-element-end-tag-omissible w3-p-d-current-element))
|
|
1793 state-transitions tran-list)
|
|
1794 (if (catch 'found
|
|
1795 (while open-list
|
|
1796 (setq state-transitions
|
|
1797 (aref (w3-element-content-model
|
|
1798 (car open-list))
|
|
1799 (w3-element-state (car open-list))))
|
|
1800 (if (memq tag-name (car state-transitions))
|
|
1801 (throw 'found t))
|
|
1802 (setq tran-list (nth 2 state-transitions))
|
|
1803 (while tran-list
|
|
1804 (cond ((memq tag-name (car (car tran-list)))
|
|
1805 (if (not (nth 3 (car tran-list)))
|
|
1806 ;; Not an error transition.
|
|
1807 (throw 'found t))
|
|
1808 (setq tran-list nil))
|
|
1809 (t
|
|
1810 (setq tran-list (cdr tran-list)))))
|
|
1811 ;; The input item is not accepted in this
|
|
1812 ;; ancestor. Try again in next ancestor.
|
|
1813 (or (w3-element-end-tag-omissible (car open-list))
|
|
1814 (setq all-end-tags-omissible nil))
|
|
1815 (setq open-list (cdr open-list)))
|
|
1816 nil)
|
|
1817 (setq w3-p-s-transition
|
|
1818 (if (w3-element-end-tag-omissible
|
|
1819 w3-p-d-current-element)
|
|
1820 (if all-end-tags-omissible
|
|
1821 ;; Probably indicates a need to debug
|
|
1822 ;; the DTD state-transition tables.
|
|
1823 '(*close *same
|
|
1824 "missing transition in DTD?")
|
|
1825 ;; Error will be reported later.
|
|
1826 '(*close *same))
|
|
1827 '(*close *same "not allowed here")))
|
|
1828 (setq w3-p-s-transition
|
|
1829 '(*discard *same "not allowed here")))))))
|
|
1830
|
|
1831 ;; We have found a transition to take. The transition is of
|
|
1832 ;; the format (ACTION NEW-STATE ERRORP) where the latter two
|
|
1833 ;; items are optional.
|
|
1834
|
|
1835 ;; First, handle any state-change.
|
|
1836 (or (memq (car-safe (cdr w3-p-s-transition)) '(nil *same))
|
|
1837 (w3-set-element-state
|
|
1838 w3-p-d-current-element
|
|
1839 (if (eq '*next (car-safe (cdr w3-p-s-transition)))
|
|
1840 (1+ (w3-element-state w3-p-d-current-element))
|
|
1841 (car-safe (cdr w3-p-s-transition)))))
|
|
1842
|
|
1843 ;; Handle any error message.
|
|
1844 (if (car-safe (cdr-safe (cdr w3-p-s-transition)))
|
|
1845 (w3-debug-html
|
|
1846 :mandatory-if (and (eq '*data tag-name)
|
|
1847 (eq '*discard (car w3-p-s-transition)))
|
|
1848 (format "Bad %s [%s], %s"
|
|
1849 (if (eq '*data tag-name)
|
|
1850 "data characters"
|
|
1851 (concat "start-tag "
|
|
1852 (w3-sgml-name-to-string tag-name)))
|
|
1853 (if (stringp (car (cdr (cdr w3-p-s-transition))))
|
|
1854 (car (cdr (cdr w3-p-s-transition)))
|
|
1855 "not allowed here")
|
|
1856 (let ((action (car w3-p-s-transition)))
|
|
1857 (cond ((eq '*discard action)
|
|
1858 "discarding bad item")
|
|
1859 ((eq '*close action)
|
|
1860 (concat "inferring </"
|
|
1861 (w3-sgml-name-to-string
|
|
1862 (w3-element-name
|
|
1863 w3-p-d-current-element))
|
|
1864 ">"))
|
|
1865 ((eq '*include action)
|
|
1866 "including bad item anyway")
|
|
1867 ((eq '*retry action)
|
|
1868 "*retry ??? you shouldn't see this")
|
|
1869 (t
|
|
1870 (concat "inferring <"
|
|
1871 (w3-sgml-name-to-string action)
|
|
1872 ">")))))))
|
|
1873
|
|
1874 ;; Handle the action.
|
|
1875 (cond
|
|
1876 ((eq '*include (car w3-p-s-transition))
|
|
1877 (setq w3-p-s-includep t)
|
|
1878 ;; Exit loop.
|
|
1879 nil)
|
|
1880 ((eq '*close (car w3-p-s-transition))
|
|
1881 ;; Perform end-tag inference.
|
|
1882 (w3-close-element) ; don't pass parameter
|
|
1883 ;; Loop and try again in parent element's content-model.
|
|
1884 t)
|
|
1885 ((eq '*discard (car w3-p-s-transition))
|
|
1886 (setq w3-p-s-includep nil)
|
|
1887 ;; Exit loop.
|
|
1888 nil)
|
|
1889 ((eq '*retry (car w3-p-s-transition))
|
|
1890 ;; Loop and try again after state change.
|
|
1891 t)
|
|
1892 ((symbolp (car w3-p-s-transition))
|
|
1893 ;; We need to open another element to contain the text,
|
|
1894 ;; probably a <P> (look in the state table).
|
|
1895 (w3-open-element (car w3-p-s-transition) nil)
|
|
1896 ;; Now we loop and try again in the new element's
|
|
1897 ;; content-model.
|
|
1898 t)
|
|
1899 (t
|
80
|
1900 (error "impossible transition")))))))
|
0
|
1901
|
|
1902 ;; Empty while loop body.
|
|
1903 )
|
|
1904
|
|
1905 ;; Return value to user indicating whether to include or discard item:
|
|
1906 ;; t ==> include
|
|
1907 ;; nil ==> discard
|
|
1908 w3-p-s-includep)
|
|
1909
|
|
1910 )
|
|
1911
|
|
1912
|
|
1913 ;;;
|
|
1914 ;;; Main parser.
|
|
1915 ;;;
|
|
1916
|
|
1917 (defvar w3-last-parse-tree nil
|
|
1918 "Used for debugging only. Stores the most recently computed parse tree
|
|
1919 \(a tree, not a parse tag stream\).")
|
|
1920
|
|
1921 (defun w3-display-parse-tree (&optional ptree)
|
|
1922 (interactive)
|
|
1923 (with-output-to-temp-buffer "W3 HTML Parse Tree"
|
|
1924 (set-buffer standard-output)
|
|
1925 (emacs-lisp-mode)
|
|
1926 (require 'pp)
|
|
1927 (pp (or ptree w3-last-parse-tree))))
|
|
1928
|
|
1929 (defalias 'w3-display-last-parse-tree 'w3-display-parse-tree)
|
|
1930
|
|
1931 ;; For compatibility with the old parser interface.
|
|
1932 (defalias 'w3-preparse-buffer 'w3-parse-buffer)
|
|
1933
|
|
1934 ;; %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
|
1935 ;; % %
|
|
1936 ;; % This is the *ONLY* valid entry point in this file! %
|
|
1937 ;; % DO NOT call any of the other functions! %
|
|
1938 ;; % %
|
|
1939 ;; %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
80
|
1940 (defun w3-parse-buffer (&optional buff)
|
0
|
1941 "Parse contents of BUFF as HTML.
|
|
1942 BUFF defaults to the value of url-working-buffer.
|
|
1943 Destructively alters contents of BUFF.
|
|
1944 Returns a data structure containing the parsed information."
|
|
1945
|
|
1946 (set-buffer (or buff url-working-buffer))
|
|
1947 (setq buff (current-buffer))
|
|
1948 (set-syntax-table w3-sgml-md-syntax-table)
|
|
1949 (buffer-disable-undo (current-buffer))
|
|
1950 (widen) ; sanity checking
|
|
1951 (goto-char (point-min))
|
|
1952 (setq case-fold-search t) ; allows smaller regexp patterns
|
|
1953
|
|
1954 ;; Some unknown pre-parse buffer munging.
|
|
1955 (if (fboundp 'sera-to-fidel-marker)
|
|
1956 (let ((sera-being-called-by-w3 t))
|
|
1957 ;; eval stops the compiler from complaining.
|
|
1958 (eval '(sera-to-fidel-marker))))
|
|
1959 (goto-char (point-min))
|
|
1960
|
|
1961 ;; *** Should premunge line boundaries.
|
|
1962 ;; ********************
|
|
1963
|
|
1964 (let* (
|
|
1965 ;; Speed hack, see the variable doc string.
|
|
1966 (gc-cons-threshold (if (> w3-gc-cons-threshold-multiplier 0)
|
|
1967 (* w3-gc-cons-threshold-multiplier
|
|
1968 gc-cons-threshold)
|
|
1969 gc-cons-threshold))
|
|
1970
|
|
1971 ;; Used to determine if we made any progress since the last loop.
|
|
1972 (last-loop-start (point-min))
|
|
1973
|
|
1974 ;; How many iterations of the main loop have occurred. Used only
|
|
1975 ;; to send messages to the user periodically, since this function
|
|
1976 ;; can take some time.
|
|
1977 (loop-count 0)
|
|
1978
|
|
1979 ;; Precomputing the loop-invariant parts of this for speed.
|
|
1980 (status-message-format
|
|
1981 (if url-show-status
|
|
1982 (format "Parsed %%3d%%%% of %d..." (- (point-max) (point-min)))))
|
|
1983
|
|
1984 ;; Use a float value for 100 if possible, otherwise integer.
|
|
1985 ;; Determine which we can use outside of the loop for speed.
|
|
1986 (one-hundred (funcall (if (fboundp 'float) 'float 'identity) 100))
|
|
1987
|
|
1988 ;; The buffer which contains the HTML we are parsing. This
|
|
1989 ;; variable is used to avoid using the more expensive
|
|
1990 ;; save-excursion.
|
|
1991 (parse-buffer (current-buffer))
|
|
1992
|
|
1993 ;; Points to start of region of text since the previous tag.
|
|
1994 (between-tags-start (point-min))
|
|
1995
|
|
1996 ;; Points past end of region of text since the previous tag. Only
|
|
1997 ;; non-nil when the region has been completely determined and is
|
|
1998 ;; ready to be processed.
|
|
1999 between-tags-end
|
|
2000
|
|
2001 ;; See doc string.
|
|
2002 w3-p-d-tag-name
|
|
2003
|
|
2004 ;; See doc string.
|
|
2005 w3-p-d-end-tag-p
|
|
2006
|
|
2007 ;; Is the tag we are looking at a null-end-tag-enabling
|
|
2008 ;; start-tag?
|
|
2009 net-tag-p
|
|
2010
|
|
2011 ;; Attributes of the tag we are looking at. An alist whose items
|
|
2012 ;; are pairs of the form (SYMBOL . STRING).
|
|
2013 tag-attributes
|
|
2014
|
|
2015 ;; Points past end of attribute value we are looking at. Points
|
|
2016 ;; past the syntactic construct, not the value of the attribute,
|
|
2017 ;; which may be at (1- attribute-value-end).
|
|
2018 attribute-value-end
|
|
2019
|
|
2020 ;; Points past end of tag we are looking at.
|
|
2021 tag-end
|
|
2022
|
|
2023 ;; See doc string.
|
|
2024 (w3-p-d-current-element (w3-fresh-element-for-tag '*document))
|
|
2025
|
|
2026 ;; See doc string.
|
|
2027 (w3-p-d-open-element-stack (list (w3-fresh-element-for-tag '*holder)))
|
|
2028
|
|
2029 ;; ***not implemented yet***
|
|
2030 (marked-section-undo-stack nil)
|
|
2031
|
|
2032 ;; See doc string.
|
|
2033 (w3-p-d-debug-url t)
|
|
2034
|
|
2035 ;; Any of the following variables with the comment ";*NESTED*"
|
|
2036 ;; are syntactic or semantic features that were introduced by
|
|
2037 ;; some containing element or marked section which will be undone
|
|
2038 ;; when we close that element or marked section.
|
|
2039
|
|
2040 ;; See doc string.
|
|
2041 (w3-p-d-non-markup-chars nil) ;*NESTED*
|
|
2042
|
|
2043 ;; See doc string.
|
|
2044 (w3-p-d-null-end-tag-enabled nil) ;*NESTED*
|
|
2045
|
|
2046 ;; See doc string.
|
|
2047 (w3-p-d-in-parsed-marked-section nil) ;*NESTED*
|
|
2048
|
|
2049 ;; See doc string.
|
|
2050 (w3-p-d-shortrefs nil) ;*NESTED*
|
|
2051
|
|
2052 ;; See doc string.
|
|
2053 (w3-p-d-shortref-chars nil) ;*NESTED*
|
|
2054
|
|
2055 ;; ******* maybe not needed.
|
|
2056 ;;
|
|
2057 ;; ;; Are we recognizing start-tags?
|
|
2058 ;; (recognizing-start-tags t) ;*NESTED*
|
|
2059 ;;
|
|
2060 ;; ;; Are we recognizing end-tags? If this is non-nil and not t,
|
|
2061 ;; ;; then only the end tag of the current open element is
|
|
2062 ;; ;; recognized.
|
|
2063 ;; (recognizing-end-tags t) ;*NESTED*
|
|
2064
|
|
2065 ;; See doc string.
|
|
2066 (w3-p-d-exceptions nil) ;*NESTED*
|
|
2067
|
|
2068 ;; Scratch variables used in this function
|
|
2069 ref attr-name attr-value content-model content open-list
|
|
2070 )
|
|
2071 ;; Scratch variables used by macros and defsubsts we call.
|
|
2072 (w3-p-s-let-bindings
|
|
2073 (w3-update-non-markup-chars)
|
80
|
2074 (setq w3-p-s-baseobject (url-generic-parse-url (url-view-url t)))
|
0
|
2075 ;; Main loop. Handle markup as follows:
|
|
2076 ;;
|
|
2077 ;; non-empty tag: Handle the region since the previous tag as PCDATA,
|
|
2078 ;; RCDATA, CDATA, if allowed by syntax. Then handle the tag.
|
|
2079 ;;
|
|
2080 ;; general entity (&name;): expand it and parse the result.
|
|
2081 ;;
|
|
2082 ;; shortref (_, {, }, and ^ in math stuff): Expand it and parse the
|
|
2083 ;; result.
|
|
2084 ;;
|
|
2085 ;; SGML marked section (<![ keywords [ conditional-text ]]>): Either
|
|
2086 ;; strip the delimiters and parse the result or delete.
|
|
2087 ;;
|
|
2088 ;; comment: Delete.
|
|
2089 ;;
|
|
2090 ;; empty tag (<>, </>): Handle as the appropriate tag.
|
|
2091 ;;
|
|
2092 ;; markup declaration (e.g. <!DOCTYPE ...>): Delete.
|
|
2093 ;;
|
|
2094 ;; SGML processing instruction (<?name>): Delete.
|
|
2095 ;;
|
|
2096 (while
|
|
2097 ;; Continue as long as we processed something last time and we
|
|
2098 ;; have more to process.
|
|
2099 (prog1
|
|
2100 (not (and (= last-loop-start (point))
|
|
2101 (eobp)))
|
|
2102 (setq last-loop-start (point)))
|
|
2103
|
|
2104 ;; Display progress messages if asked and/or do incremental display
|
|
2105 ;; of results
|
|
2106 (cond ((= 0 (% (setq loop-count (1+ loop-count)) 40))
|
|
2107 (if status-message-format
|
|
2108 (message status-message-format
|
|
2109 ;; Percentage of buffer processed.
|
|
2110 (/ (* (point) one-hundred) (point-max))))))
|
|
2111
|
|
2112 ;; Go to next interesting thing in the buffer.
|
|
2113 (skip-chars-forward w3-p-d-non-markup-chars)
|
|
2114
|
|
2115 ;; We are looking at a markup-starting character, and invalid
|
|
2116 ;; character, or end of buffer.
|
|
2117 (cond
|
|
2118
|
70
|
2119 ((= ?< (following-char))
|
0
|
2120
|
|
2121 ;; We are looking at a tag, comment, markup declaration, SGML marked
|
|
2122 ;; section, SGML processing instruction, or non-markup "<".
|
|
2123 (forward-char)
|
|
2124 (cond
|
|
2125
|
|
2126 ((looking-at "/?\\([a-z][-a-z0-9.]*\\)")
|
|
2127 ;; We are looking at a non-empty tag.
|
80
|
2128
|
|
2129 ;; Downcase it in the buffer, to save creation of a string
|
|
2130 (downcase-region (match-beginning 1) (match-end 1))
|
0
|
2131 (setq w3-p-d-tag-name
|
80
|
2132 (intern (buffer-substring (match-beginning 1)
|
|
2133 (match-end 1))))
|
70
|
2134 (setq w3-p-d-end-tag-p (= ?/ (following-char)))
|
|
2135 (setq between-tags-end (1- (point)))
|
0
|
2136 (goto-char (match-end 0))
|
|
2137
|
|
2138 ;; Read the attributes from a start-tag.
|
80
|
2139 (if w3-p-d-end-tag-p
|
98
|
2140 (if (looking-at "[ \t\r\n/]*[<>]")
|
80
|
2141 nil
|
|
2142 ;; This is in here to deal with those idiots who stick
|
|
2143 ;; attribute/value pairs on end tags. *sigh*
|
|
2144 (w3-debug-html "Evil attributes on end tag.")
|
|
2145 (skip-chars-forward "^>"))
|
0
|
2146
|
|
2147 ;; Attribute values can be:
|
|
2148 ;; "STRING" where STRING does not contain the double quote
|
|
2149 ;; 'STRING' where STRING does not contain the single quote
|
|
2150 ;; name-start character, *name character
|
|
2151 ;; *name character
|
|
2152 ;; Digit, +name character
|
|
2153 ;; +Digit
|
|
2154 ;; or a SPACE-separated list of one of the last four
|
|
2155 ;; possibilities (there is a comment somewhere that this is a
|
|
2156 ;; misinterpretation of the grammar, so we ignore this
|
|
2157 ;; possibility).
|
|
2158 (while
|
|
2159 (looking-at
|
|
2160 (eval-when-compile
|
|
2161 (concat
|
|
2162 ;; Leading whitespace.
|
|
2163 "[ \n\r\t]*"
|
|
2164 ;; The attribute name, possibly with a bad syntax
|
|
2165 ;; component.
|
80
|
2166 "\\([a-z_][-a-z0-9.]*\\(\\([_][-a-z0-9._]*\\)?\\)\\)"
|
0
|
2167 ;; Trailing whitespace and perhaps an "=".
|
|
2168 "[ \n\r\t]*\\(\\(=[ \n\r\t]*\\)?\\)")))
|
|
2169
|
|
2170 (cond ((/= (match-beginning 2) (match-end 2))
|
|
2171 (w3-debug-html
|
|
2172 :nocontext
|
|
2173 (format "Bad attribute name syntax: %s"
|
|
2174 (buffer-substring (match-beginning 1)
|
|
2175 (match-end 1))))))
|
80
|
2176
|
|
2177 ;; Downcase it in the buffer, to save creation of a string
|
|
2178 (downcase-region (match-beginning 1) (match-end 1))
|
0
|
2179 (setq attr-name
|
80
|
2180 (intern (buffer-substring (match-beginning 1)
|
|
2181 (match-end 1))))
|
0
|
2182 (goto-char (match-end 0))
|
|
2183 (cond
|
|
2184 ((< (match-beginning 4) (match-end 4))
|
|
2185 ;; A value was specified (e.g. ATTRIBUTE=VALUE).
|
|
2186 (cond
|
|
2187 ((looking-at
|
|
2188 (eval-when-compile
|
|
2189 (concat
|
|
2190 ;; Literal with double quotes.
|
|
2191 "\"\\([^\"]*\\)\""
|
|
2192 "\\|"
|
|
2193 ;; Literal with single quotes.
|
80
|
2194 "'\\([^']*\\)'"
|
0
|
2195 "\\|"
|
|
2196 ;; Handle bad HTML conflicting with NET-enabling
|
|
2197 ;; start-tags.
|
|
2198 "\\([-a-z0-9.]+/[-a-z0-9._/#]+\\)[ \t\n\r>]"
|
|
2199 "\\|"
|
|
2200 ;; SGML NAME-syntax attribute value.
|
|
2201 "\\([-a-z0-9.]+\\)[ \t\n\r></]"
|
|
2202 )))
|
|
2203 (cond
|
|
2204 ((or (match-beginning 1)
|
|
2205 (match-beginning 2))
|
|
2206 ;; We have an attribute value literal.
|
|
2207 (narrow-to-region (1+ (match-beginning 0))
|
|
2208 (1- (match-end 0)))
|
|
2209
|
|
2210 ;; In attribute value literals, EE and RS are ignored
|
|
2211 ;; and RE and SEPCHAR characters sequences are
|
|
2212 ;; replaced by SPACEs.
|
|
2213 ;;
|
|
2214 ;; (There is no way right now to get RS into one of
|
|
2215 ;; these so that it can be ignored. This is due to
|
|
2216 ;; our using Unix line-handling conventions.)
|
|
2217 (skip-chars-forward "^&\t\n\r")
|
|
2218 (if (eobp)
|
|
2219 nil
|
|
2220 ;; We must expand entities and replace RS, RE,
|
|
2221 ;; and SEPCHAR.
|
|
2222 (goto-char (point-min))
|
|
2223 (while (progn
|
|
2224 (skip-chars-forward "^&")
|
|
2225 (not (eobp)))
|
|
2226 (w3-expand-entity-at-point-maybe))
|
80
|
2227 (subst-char-in-region (point-min) (point-max) ?\t ? )
|
|
2228 (subst-char-in-region (point-min) (point-max) ?\n ? ))
|
0
|
2229 ;; Set this after we have changed the size of the
|
|
2230 ;; attribute.
|
|
2231 (setq attribute-value-end (1+ (point-max))))
|
|
2232 ((match-beginning 4)
|
|
2233 (setq attribute-value-end (match-end 4))
|
|
2234 (narrow-to-region (point) attribute-value-end))
|
|
2235 ((match-beginning 3)
|
|
2236 (setq attribute-value-end (match-end 3))
|
|
2237 (narrow-to-region (point) attribute-value-end)
|
|
2238 ;; Horribly illegal non-SGML handling of bad
|
|
2239 ;; HTML on the net. This can break valid HTML.
|
|
2240 (setq attr-value (buffer-substring (point)
|
|
2241 (match-end 3)))
|
|
2242 (w3-debug-html :nocontext
|
|
2243 (format "Evil attribute value syntax: %s"
|
|
2244 (buffer-substring (point-min) (point-max)))))
|
|
2245 (t
|
80
|
2246 (error "impossible attribute value"))))
|
70
|
2247 ((memq (following-char) '(?\" ?'))
|
0
|
2248 ;; Missing terminating quote character.
|
|
2249 (narrow-to-region (point)
|
|
2250 (progn
|
|
2251 (forward-char 1)
|
80
|
2252 (skip-chars-forward "^ \t\n\r'\"<>")
|
0
|
2253 (setq attribute-value-end (point))))
|
|
2254 (w3-debug-html :nocontext
|
|
2255 (format "Attribute value missing end quote: %s"
|
|
2256 (buffer-substring (point-min) (point-max))))
|
|
2257 (narrow-to-region (1+ (point-min)) (point-max)))
|
|
2258 (t
|
|
2259 ;; We have a syntactically invalid attribute value. Let's
|
|
2260 ;; make a best guess as to what the author intended.
|
|
2261 (narrow-to-region (point)
|
|
2262 (progn
|
80
|
2263 (skip-chars-forward "^ \t\n\r'\"<>")
|
0
|
2264 (setq attribute-value-end (point))))
|
|
2265 (w3-debug-html :nocontext
|
|
2266 (format "Bad attribute value syntax: %s"
|
|
2267 (buffer-substring (point-min) (point-max))))))
|
|
2268 ;; Now we have isolated the attribute value. We need to
|
|
2269 ;; munge the value depending on the syntax of the
|
|
2270 ;; attribute.
|
|
2271 ;; *** Right now, we only implement the necessary munging
|
|
2272 ;; for CDATA attributes, which is none. I'm not sure why
|
|
2273 ;; this happens to work for other attributes right now.
|
|
2274 ;; For any other kind of attribute, we are supposed to
|
|
2275 ;; * smash case
|
|
2276 ;; * remove leading/trailing whitespace
|
|
2277 ;; * smash multiple space sequences into single spaces
|
|
2278 ;; * verify the syntax of each token
|
|
2279 (setq attr-value (buffer-substring (point-min) (point-max)))
|
80
|
2280 (case attr-name
|
|
2281 (class
|
|
2282 (setq attr-value (split-string attr-value "[ ,]+")))
|
|
2283 (align
|
|
2284 (if (string-match "^[ \t\r\n]*\\(.*\\)[ \t\r\n]*$"
|
|
2285 attr-value)
|
|
2286 (setq attr-value (downcase
|
|
2287 (substring attr-value
|
|
2288 (match-beginning 1)
|
|
2289 (match-end 1))))
|
|
2290 (setq attr-value (downcase attr-value)))
|
|
2291 (setq attr-value (intern attr-value)))
|
|
2292 ((src href)
|
|
2293 ;; I should expand URLs here
|
|
2294 )
|
|
2295 (otherwise nil)
|
|
2296 )
|
0
|
2297 (widen)
|
|
2298 (goto-char attribute-value-end))
|
|
2299 (t
|
|
2300 ;; No value was specified, in which case NAME should be
|
|
2301 ;; taken as ATTRIBUTE=NAME where NAME is one of the
|
|
2302 ;; enumerated values for ATTRIBUTE.
|
|
2303 ;; We assume here that ATTRIBUTE is the same as NAME.
|
|
2304 ;; *** Another piece of code will fix the attribute name if it
|
|
2305 ;; is wrong.
|
|
2306 (setq attr-value (symbol-name attr-name))))
|
|
2307
|
|
2308 ;; Accumulate the attributes.
|
|
2309 (setq tag-attributes (cons (cons attr-name attr-value)
|
80
|
2310 tag-attributes)))
|
|
2311
|
|
2312 (cond
|
|
2313 ((and (eq w3-p-d-tag-name 'base)
|
|
2314 (setq w3-p-s-baseobject
|
|
2315 (or (assq 'src tag-attributes)
|
|
2316 (assq 'href tag-attributes))))
|
|
2317 (setq w3-p-s-baseobject (url-generic-parse-url
|
|
2318 (cdr w3-p-s-baseobject))))
|
|
2319 ((setq w3-p-s-btdt (or (assq 'src tag-attributes)
|
|
2320 (assq 'href tag-attributes)
|
|
2321 (assq 'action tag-attributes)))
|
|
2322 (setcdr w3-p-s-btdt (url-expand-file-name (cdr w3-p-s-btdt)
|
|
2323 w3-p-s-baseobject))
|
|
2324 (setq w3-p-s-btdt (if (url-have-visited-url (cdr w3-p-s-btdt))
|
|
2325 ":visited"
|
|
2326 ":link"))
|
|
2327 (if (assq 'class tag-attributes)
|
|
2328 (setcdr (assq 'class tag-attributes)
|
|
2329 (cons w3-p-s-btdt
|
|
2330 (cdr (assq 'class tag-attributes))))
|
|
2331 (setq tag-attributes (cons (cons 'class (list w3-p-s-btdt))
|
|
2332 tag-attributes))))
|
|
2333 )
|
98
|
2334 (if (not (eq w3-p-d-tag-name 'input))
|
|
2335 nil
|
|
2336 (setq w3-p-s-btdt (concat ":"
|
|
2337 (downcase
|
|
2338 (or (cdr-safe
|
|
2339 (assq 'type tag-attributes))
|
|
2340 "text"))))
|
|
2341 (if (assq 'class tag-attributes)
|
|
2342 (setcdr (assq 'class tag-attributes)
|
|
2343 (cons w3-p-s-btdt
|
|
2344 (cdr (assq 'class tag-attributes))))
|
|
2345 (setq tag-attributes (cons (cons 'class (list w3-p-s-btdt))
|
|
2346 tag-attributes))))
|
80
|
2347 )
|
0
|
2348
|
|
2349 ;; Process the end of the tag.
|
|
2350 (skip-chars-forward " \t\n\r")
|
70
|
2351 (cond ((= ?> (following-char))
|
0
|
2352 ;; Ordinary tag end.
|
|
2353 (forward-char 1))
|
70
|
2354 ((and (= ?/ (following-char))
|
0
|
2355 (not w3-p-d-end-tag-p))
|
|
2356 ;; This is a NET-enabling start-tag.
|
|
2357 (setq net-tag-p t)
|
|
2358 (forward-char 1))
|
70
|
2359 ((= ?< (following-char))
|
0
|
2360 ;; *** Strictly speaking, the following text has to
|
|
2361 ;; lexically be STAGO or ETAGO, which means that it
|
|
2362 ;; can't match some other lexical unit.
|
|
2363 ;; Unclosed tag.
|
|
2364 nil)
|
|
2365 (t
|
|
2366 ;; Syntax error.
|
|
2367 (w3-debug-html
|
|
2368 (format "Bad unclosed %s%s tag"
|
|
2369 (if w3-p-d-end-tag-p "/" "")
|
|
2370 (w3-sgml-name-to-string w3-p-d-tag-name)))))
|
|
2371
|
|
2372 (setq tag-end (point)))
|
|
2373
|
|
2374 ((looking-at "/?>")
|
|
2375 ;; We are looking at an empty tag (<>, </>).
|
70
|
2376 (setq w3-p-d-end-tag-p (= ?/ (following-char)))
|
0
|
2377 (setq w3-p-d-tag-name (if w3-p-d-end-tag-p
|
|
2378 (w3-element-name w3-p-d-current-element)
|
|
2379 ;; *** Strictly speaking, if OMITTAG NO, then
|
|
2380 ;; we should use the most recently closed tag.
|
|
2381 ;; But OMITTAG YES in HTML and I'm lazy.
|
|
2382 (w3-element-name w3-p-d-current-element)))
|
|
2383 (setq tag-attributes nil)
|
|
2384 ;; *** Make sure this is not at top level.
|
|
2385 (setq between-tags-end (1- (point)))
|
|
2386 (setq tag-end (match-end 0)))
|
|
2387
|
|
2388 ;; *** In SGML, <(doctype)element> is valid tag syntax. This
|
|
2389 ;; cannot occur in HTML because the CONCUR option is off in the
|
|
2390 ;; SGML declaration.
|
|
2391
|
|
2392 ((looking-at "!--")
|
|
2393 ;; We found a comment, delete to end of comment.
|
|
2394 (delete-region
|
|
2395 (1- (point))
|
|
2396 (progn
|
|
2397 (forward-char 1)
|
|
2398 ;; Skip over pairs of -- ... --.
|
|
2399 (if (looking-at "\\(--[^-]*\\(-[^-]+\\)*--[ \t\r\n]*\\)+>")
|
|
2400 (goto-char (match-end 0))
|
|
2401 ;; Syntax error!
|
|
2402 (w3-debug-html
|
|
2403 "Bad comment (unterminated or unbalanced \"--\" pairs)")
|
|
2404 (forward-char 2)
|
|
2405 (or (re-search-forward "--[ \t\r\n]*>" nil t)
|
|
2406 (search-forward ">" nil t)))
|
|
2407 (point))))
|
|
2408
|
|
2409 ((looking-at "!>\\|\\?[^>]*>")
|
|
2410 ;; We are looking at an empty comment or a processing
|
|
2411 ;; instruction. Delete it.
|
|
2412 (replace-match "")
|
|
2413 (delete-char -1))
|
|
2414
|
|
2415 ((looking-at "![a-z]")
|
|
2416 ;; We are looking at a markup declaration. Delete it.
|
|
2417 ;; *** Technically speaking, to handle valid HTML I think we
|
|
2418 ;; need to handle "<!USEMAP ... >" declarations. In the future,
|
|
2419 ;; to handle general SGML, we should parse "<!DOCTYPE ... >"
|
|
2420 ;; declarations as well (which can contain other declarations).
|
|
2421 ;; In the very distant future, perhaps we will handle "<!SGML
|
|
2422 ;; ... >" declarations.
|
|
2423 ;; *** Should warn if it's not SGML, DOCTYPE, or USEMAP.
|
|
2424 (backward-char 1)
|
|
2425 (delete-region
|
|
2426 (point)
|
|
2427 (progn
|
|
2428 (condition-case nil
|
|
2429 (forward-sexp 1)
|
|
2430 (error
|
|
2431 ;; *** This might not actually be bad syntax, but might
|
|
2432 ;; instead be a -- ... -- comment with unbalanced
|
|
2433 ;; parentheses somewhere inside the declaration. Handling
|
|
2434 ;; this properly would require full parsing of markup
|
|
2435 ;; declarations, a goal for the future.
|
|
2436 (w3-debug-html "Bad <! syntax.")
|
|
2437 (skip-chars-forward "^>")
|
70
|
2438 (if (= ?> (following-char))
|
0
|
2439 (forward-char))))
|
|
2440 (point))))
|
|
2441
|
|
2442 ((looking-at "!\\\[\\(\\([ \t\n\r]*[a-z]+\\)+[ \t\n\r]*\\)\\\[")
|
|
2443 ;; We are looking at a marked section.
|
|
2444 ;; *** Strictly speaking, we should issue a warning if the
|
|
2445 ;; keywords are invalid or missing or if the "[" does not follow.
|
|
2446 ;; We must look at the keywords to understand how to parse it.
|
|
2447 ;; *** Strictly speaking, we should perform parameter entity
|
|
2448 ;; substitution on the keywords first.
|
|
2449 (goto-char (match-beginning 1))
|
|
2450 (insert ?\))
|
|
2451 (goto-char (1- (match-beginning 0)))
|
|
2452 (delete-char 3)
|
|
2453 (insert ?\()
|
|
2454 (backward-char 1)
|
|
2455 (let* ((keywords (read (current-buffer)))
|
|
2456 ;; Multiple keywords may appear, but only the most
|
|
2457 ;; significant takes effect. Rank order is IGNORE, CDATA,
|
|
2458 ;; RCDATA, INCLUDE, and TEMP. INCLUDE and TEMP have the
|
|
2459 ;; same effect.
|
|
2460 (keyword (car-safe (cond ((memq 'IGNORE keywords))
|
|
2461 ((memq 'CDATA keywords))
|
|
2462 ((memq 'RCDATA keywords))
|
|
2463 ((memq 'INCLUDE keywords))
|
|
2464 ((memq 'TEMP keywords))))))
|
70
|
2465 (or (= ?\[ (following-char))
|
0
|
2466 ;; I probably shouldn't even check this, since it is so
|
|
2467 ;; impossible.
|
80
|
2468 (error "impossible ??"))
|
0
|
2469 (forward-char 1)
|
|
2470 (delete-region (1- (match-beginning 0)) (point))
|
|
2471 (cond ((eq 'IGNORE keyword)
|
|
2472 ;; Scan forward skipping over matching <![ ... ]]>
|
|
2473 ;; until we find an unmatched "]]>".
|
|
2474 (let ((ignore-nesting 1)
|
|
2475 (start-pos (point)))
|
|
2476 (while (> ignore-nesting 0)
|
|
2477 (if (re-search-forward "<!\\\\\[\\|\]\]>" nil t)
|
|
2478 (setq ignore-nesting
|
|
2479 (if (eq ?> (preceding-char))
|
|
2480 (1- ignore-nesting)
|
|
2481 (1+ ignore-nesting)))
|
|
2482 (w3-debug-html
|
|
2483 "Unterminated IGNORE marked section.")
|
|
2484 (setq ignore-nesting 0)
|
|
2485 (goto-char start-pos)))
|
|
2486 (delete-region start-pos (point))))
|
|
2487 ((eq 'CDATA keyword)
|
|
2488 (error "***unimplemented***"))
|
|
2489 ((eq 'RCDATA keyword)
|
|
2490 (error "***unimplemented***"))
|
|
2491 ((memq keyword '(INCLUDE TEMP))
|
|
2492 (error "***unimplemented***")))))
|
|
2493 ((and (looking-at "!")
|
|
2494 w3-netscape-compatible-comments)
|
|
2495 ;; Horribly illegal non-SGML handling of bad HTML on the net.
|
|
2496 ;; This can break valid HTML.
|
|
2497 ;; This arises because Netscape discards anything looking like
|
|
2498 ;; "<!...>". So people expect they can use this construct as
|
|
2499 ;; a comment.
|
|
2500 (w3-debug-html "Evil <! comment syntax.")
|
|
2501 (backward-char 1)
|
|
2502 (delete-region
|
|
2503 (point)
|
|
2504 (progn
|
|
2505 (skip-chars-forward "^>")
|
70
|
2506 (if (= ?> (following-char))
|
0
|
2507 (forward-char))
|
|
2508 (point))))
|
|
2509 (t
|
|
2510 ;; This < is not a markup character. Pretend we didn't notice
|
|
2511 ;; it at all. We have skipped over the < already, so just loop
|
|
2512 ;; again.
|
|
2513 )))
|
|
2514
|
70
|
2515 ((= ?& (following-char))
|
0
|
2516 (w3-expand-entity-at-point-maybe))
|
|
2517
|
70
|
2518 ((and (= ?\] (following-char))
|
0
|
2519 w3-p-d-in-parsed-marked-section
|
|
2520 (looking-at "]]>"))
|
|
2521 ;; *** handle the end of a parsed marked section.
|
|
2522 (error "***unimplemented***"))
|
|
2523
|
70
|
2524 ((and (= ?/ (following-char))
|
0
|
2525 w3-p-d-null-end-tag-enabled)
|
|
2526 ;; We are looking at a null end tag.
|
|
2527 (setq w3-p-d-end-tag-p t)
|
|
2528 (setq between-tags-end (point))
|
|
2529 (setq tag-end (1+ (point)))
|
|
2530 (setq w3-p-d-tag-name (w3-element-name w3-p-d-current-element)))
|
|
2531
|
|
2532 ;; This can be slow, since we'll hardly ever get here.
|
|
2533 ;; *** Strictly speaking, I think we're supposed to handle
|
|
2534 ;; shortrefs that begin with the same characters as other markup,
|
|
2535 ;; preferring the longest match.
|
|
2536 ;; I will assume that shortrefs never begin with <, &, \], /.
|
|
2537 ((setq ref (catch 'found-shortref
|
|
2538 (let ((refs w3-p-d-shortrefs))
|
|
2539 (while refs
|
|
2540 (if (looking-at (car (car refs)))
|
|
2541 (throw 'found-shortref (cdr (car refs))))
|
|
2542 (setq refs (cdr refs))))))
|
|
2543 ;; We are looking at a shortref for which there is an
|
|
2544 ;; expansion defined in the current syntax. Replace with the
|
|
2545 ;; expansion, leaving point at the beginning so it will be parsed
|
|
2546 ;; on the next loop.
|
|
2547 ;; *** eek. This is wrong if the shortref is for an entity with
|
|
2548 ;; CDATA syntax which should not be reparsed for tags.
|
|
2549 (replace-match "")
|
|
2550 (let ((pt (point)))
|
|
2551 (insert ref)
|
|
2552 (goto-char pt)))
|
|
2553
|
|
2554 ((looking-at (eval-when-compile
|
|
2555 (concat "[" (w3-invalid-sgml-chars) "]")))
|
|
2556 (w3-debug-html
|
70
|
2557 (format "Invalid SGML character: %c" (following-char)))
|
|
2558 (insert (or (cdr-safe (assq (following-char)
|
0
|
2559 ;; These characters are apparently
|
|
2560 ;; from a Windows character set.
|
|
2561 '((146 . "'")
|
|
2562 (153 . "TM"))))
|
|
2563 ""))
|
|
2564 (delete-char 1))
|
|
2565
|
|
2566 ((eobp)
|
|
2567 ;; We have finished the buffer. Make sure we process the last
|
|
2568 ;; piece of text, if any.
|
|
2569 (setq between-tags-end (point))
|
|
2570 ;; We have to test what's on the element stack because this
|
|
2571 ;; piece of code gets executed twice.
|
|
2572 (cond ((not (eq '*holder (w3-element-name w3-p-d-current-element)))
|
|
2573 ;; This forces the calculation of implied omitted end tags.
|
|
2574 (setq w3-p-d-tag-name '*document)
|
|
2575 (setq w3-p-d-end-tag-p t)
|
|
2576 (setq tag-end (point)))))
|
|
2577
|
|
2578 (t
|
|
2579 (error "unreachable code, this can't happen")))
|
|
2580
|
|
2581 ;; If we have determined the boundaries of a non-empty between-tags
|
|
2582 ;; region of text, then handle it.
|
|
2583 (cond
|
|
2584 (between-tags-end
|
|
2585 (cond
|
|
2586 ((< between-tags-start between-tags-end)
|
|
2587 ;; We have a non-empty between-tags region.
|
|
2588
|
|
2589 ;; We check if it's entirely whitespace, because we record the
|
|
2590 ;; transitions for whitespace separately from those for
|
|
2591 ;; data with non-whitespace characters.
|
|
2592 (goto-char between-tags-start)
|
|
2593 (skip-chars-forward " \t\n\r" between-tags-end)
|
|
2594 (cond
|
|
2595 ((w3-grok-tag-or-data (prog1
|
|
2596 (if (= between-tags-end (point))
|
|
2597 '*space
|
|
2598 '*data)
|
|
2599 (goto-char between-tags-end)))
|
|
2600 ;; We have to include the text in the current element's
|
|
2601 ;; contents. If this is the first item in the current
|
|
2602 ;; element's contents, don't include a leading newline if
|
|
2603 ;; there is one. Add a trailing newline as a separate text
|
|
2604 ;; item so that it can be removed later if it turns out to
|
|
2605 ;; be the last item in the current element's contents when
|
|
2606 ;; the current element is closed.
|
|
2607 ;; *** We could perform this test before calling
|
|
2608 ;; w3-grok-tag-or-data, but it's not clear which will be
|
|
2609 ;; faster in practice.
|
|
2610 (or (setq content (w3-element-content w3-p-d-current-element))
|
|
2611 ;; *** Strictly speaking, in SGML the record end is
|
|
2612 ;; carriage return, not line feed.
|
70
|
2613 (if (= ?\n (char-after between-tags-start))
|
0
|
2614 (setq between-tags-start (1+ between-tags-start))))
|
|
2615 (if (= between-tags-start (point))
|
|
2616 ;; Do nothing.
|
|
2617 nil
|
|
2618 ;; We are definitely going to add data characters to the
|
|
2619 ;; content.
|
|
2620 (cond
|
|
2621 ((and (= ?\n (preceding-char))
|
|
2622 (/= between-tags-start (1- (point))))
|
|
2623 (setq content (cons (buffer-substring between-tags-start
|
|
2624 (1- (point)))
|
|
2625 content))
|
|
2626 (setq content (cons "\n" content)))
|
|
2627 (t
|
|
2628 (setq content (cons (buffer-substring between-tags-start
|
|
2629 (point))
|
|
2630 content))))
|
|
2631 (w3-set-element-content w3-p-d-current-element content))))))
|
|
2632
|
|
2633 (setq between-tags-end nil)))
|
|
2634
|
|
2635 ;; If the previous expression modified (point), then it went to
|
|
2636 ;; the value of between-tags-end.
|
|
2637
|
|
2638 ;; If we found a start or end-tag, we need to handle it.
|
|
2639 (cond
|
|
2640 (w3-p-d-tag-name
|
|
2641
|
|
2642 ;; Move past the tag and prepare for next between-tags region.
|
|
2643 (goto-char tag-end)
|
|
2644 (setq between-tags-start (point))
|
|
2645
|
|
2646 (cond
|
|
2647 (w3-p-d-end-tag-p
|
|
2648 ;; Handle an end-tag.
|
|
2649 (if (eq w3-p-d-tag-name (w3-element-name w3-p-d-current-element))
|
|
2650 (w3-close-element)
|
|
2651 ;; Handle the complex version. We have to search up (down?)
|
|
2652 ;; the open element stack to find the element that matches (if
|
|
2653 ;; any). Then we close all of the elements. On a conforming
|
|
2654 ;; SGML document this can do no wrong and it's not
|
|
2655 ;; unreasonable on a non-conforming document.
|
|
2656
|
|
2657 ;; Can't safely modify stack until we know the element we want
|
|
2658 ;; to find is in there, so work with a copy.
|
|
2659 (setq open-list w3-p-d-open-element-stack)
|
|
2660 (while (and open-list
|
|
2661 (not (eq w3-p-d-tag-name
|
|
2662 (w3-element-name (car open-list)))))
|
|
2663 (setq open-list (cdr open-list)))
|
|
2664 (cond (open-list
|
|
2665 ;; We found a match. Pop elements.
|
|
2666 ;; We will use the following value as a sentinel.
|
|
2667 (setq open-list (cdr open-list))
|
|
2668 (while (not (eq open-list w3-p-d-open-element-stack))
|
|
2669 (w3-close-element t))
|
|
2670 (w3-close-element))
|
|
2671 (t
|
|
2672 ;; Bogus end tag.
|
|
2673 (w3-debug-html
|
|
2674 (format "Unmatched end-tag </%s>"
|
|
2675 (w3-sgml-name-to-string w3-p-d-tag-name)))))))
|
|
2676 (t
|
|
2677 ;; Handle a start-tag.
|
|
2678 (cond
|
|
2679 ;; Check if the new element is allowed in the current element's
|
|
2680 ;; content model.
|
|
2681 ((w3-grok-tag-or-data w3-p-d-tag-name)
|
|
2682 (w3-open-element w3-p-d-tag-name tag-attributes)
|
|
2683
|
|
2684 ;; Handle NET-enabling start tags.
|
|
2685 (cond ((and net-tag-p
|
|
2686 (not w3-p-d-null-end-tag-enabled))
|
|
2687 ;; Save old values.
|
|
2688 (w3-set-element-undo-list
|
|
2689 w3-p-d-current-element
|
|
2690 (cons (cons 'w3-p-d-non-markup-chars
|
|
2691 w3-p-d-non-markup-chars)
|
|
2692 (cons '(w3-p-d-null-end-tag-enabled . nil)
|
|
2693 (w3-element-undo-list w3-p-d-current-element))))
|
|
2694 ;; Alter syntax.
|
|
2695 (setq w3-p-d-null-end-tag-enabled t)
|
|
2696 (w3-update-non-markup-chars)))
|
|
2697
|
|
2698 (setq content-model
|
|
2699 (w3-element-content-model w3-p-d-current-element))
|
|
2700
|
|
2701 ;; If the element does not have parsed contents, then we
|
|
2702 ;; can find its contents immediately.
|
|
2703 (cond
|
|
2704 ((memq content-model '(EMPTY CDATA XCDATA XXCDATA RCDATA))
|
|
2705 (cond
|
|
2706 ((eq 'EMPTY content-model)
|
|
2707 (w3-close-element))
|
|
2708 ((eq 'CDATA content-model)
|
|
2709 ;; CDATA: all data characters until an end-tag. We'll
|
|
2710 ;; process the end-tag on the next loop.
|
|
2711 (if (re-search-forward (if w3-p-d-null-end-tag-enabled
|
|
2712 "</[a-z>]\\|/"
|
|
2713 "</[a-z>]")
|
|
2714 nil 'move)
|
|
2715 (goto-char (match-beginning 0))))
|
|
2716 ((eq 'XCDATA content-model)
|
|
2717 ;; XCDATA: special non-SGML-standard mode which includes
|
|
2718 ;; all data characters until "</foo" is seen where "foo"
|
|
2719 ;; is the name of this element (for XMP and LISTING).
|
|
2720 (if (search-forward
|
|
2721 (concat "</" (symbol-name
|
|
2722 (w3-element-name w3-p-d-current-element)))
|
|
2723 nil 'move)
|
|
2724 (goto-char (match-beginning 0))))
|
|
2725 ((eq 'XXCDATA content-model)
|
|
2726 ;; XXCDATA: special non-SGML-standard mode which includes
|
|
2727 ;; all data until end-of-entity (end-of-buffer for us)
|
|
2728 ;; (for PLAINTEXT).
|
|
2729 (goto-char (point-max)))
|
|
2730 ((eq 'RCDATA content-model)
|
|
2731 ;; RCDATA: all data characters until end-tag is seen,
|
|
2732 ;; except that entities are expanded first, although the
|
|
2733 ;; expansions are _not_ scanned for end-tags, although the
|
|
2734 ;; expansions _are_ scanned for further entity
|
|
2735 ;; references.
|
|
2736 (while (progn
|
|
2737 (if (re-search-forward (if w3-p-d-null-end-tag-enabled
|
|
2738 "</[a-z>]\\|[/&]"
|
|
2739 "</[a-z>]\\|&")
|
|
2740 nil 'move)
|
|
2741 (goto-char (match-beginning 0)))
|
70
|
2742 (= ?& (following-char)))
|
0
|
2743 (w3-expand-entity-at-point-maybe)))))))
|
|
2744 (t
|
|
2745 ;; The element is illegal here. We'll just discard the start
|
|
2746 ;; tag as though we never saw it.
|
|
2747 ))))
|
|
2748
|
|
2749 (setq w3-p-d-tag-name nil)
|
|
2750 (setq w3-p-d-end-tag-p nil)
|
|
2751 (setq net-tag-p nil)
|
|
2752 (setq tag-attributes nil)
|
|
2753 (setq tag-end nil)))
|
|
2754
|
|
2755 ;; End of main while loop.
|
|
2756 )
|
|
2757
|
|
2758 ;; We have finished parsing the buffer!
|
|
2759 (if status-message-format
|
|
2760 (message "%sdone" (format status-message-format 100)))
|
|
2761
|
|
2762 ;; *** For debugging, save the true parse tree.
|
|
2763 ;; *** Make this look inside *DOCUMENT.
|
|
2764 (setq w3-last-parse-tree
|
|
2765 (w3-element-content w3-p-d-current-element))
|
80
|
2766
|
|
2767 (w3-element-content w3-p-d-current-element)
|
70
|
2768 )))
|
14
|
2769
|
70
|
2770
|
0
|
2771
|
|
2772 (provide 'w3-parse)
|
|
2773
|
|
2774 ;; Local variables:
|
|
2775 ;; indent-tabs-mode: nil
|
|
2776 ;; end:
|