comparison program/lib/Roundcube/rcube_washtml.php @ 0:4681f974d28b

vanilla 1.3.3 distro, I hope
author Charlie Root
date Thu, 04 Jan 2018 15:52:31 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:4681f974d28b
1 <?php
2
3 /**
4 +-----------------------------------------------------------------------+
5 | This file is part of the Roundcube Webmail client |
6 | Copyright (C) 2008-2012, The Roundcube Dev Team |
7 | |
8 | Licensed under the GNU General Public License version 3 or |
9 | any later version with exceptions for skins & plugins. |
10 | See the README file for a full license statement. |
11 | |
12 | PURPOSE: |
13 | Utility class providing HTML sanitizer (based on Washtml class) |
14 +-----------------------------------------------------------------------+
15 | Author: Thomas Bruederli <roundcube@gmail.com> |
16 | Author: Aleksander Machniak <alec@alec.pl> |
17 | Author: Frederic Motte <fmotte@ubixis.com> |
18 +-----------------------------------------------------------------------+
19 */
20
21 /*
22 * Washtml, an HTML sanitizer.
23 *
24 * Copyright (c) 2007 Frederic Motte <fmotte@ubixis.com>
25 * All rights reserved.
26 *
27 * Redistribution and use in source and binary forms, with or without
28 * modification, are permitted provided that the following conditions
29 * are met:
30 * 1. Redistributions of source code must retain the above copyright
31 * notice, this list of conditions and the following disclaimer.
32 * 2. Redistributions in binary form must reproduce the above copyright
33 * notice, this list of conditions and the following disclaimer in the
34 * documentation and/or other materials provided with the distribution.
35 *
36 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
37 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
38 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
39 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
40 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
41 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
42 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
43 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
44 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
45 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
46 *
47 * OVERVIEW:
48 *
49 * Washtml takes untrusted HTML and returns a safe HTML string.
50 *
51 * SYNOPSIS:
52 *
53 * $washer = new washtml($config);
54 * $washer->wash($html);
55 * It returns a sanitized string of the $html parameter without html and head tags.
56 * $html is a string containing the html code to wash.
57 * $config is an array containing options:
58 * $config['allow_remote'] is a boolean to allow link to remote images.
59 * $config['blocked_src'] string with image-src to be used for blocked remote images
60 * $config['show_washed'] is a boolean to include washed out attributes as x-washed
61 * $config['cid_map'] is an array where cid urls index urls to replace them.
62 * $config['charset'] is a string containing the charset of the HTML document if it is not defined in it.
63 * $washer->extlinks is a reference to a boolean that is set to true if remote images were removed. (FE: show remote images link)
64 *
65 * INTERNALS:
66 *
67 * Only tags and attributes in the static lists $html_elements and $html_attributes
68 * are kept, inline styles are also filtered: all style identifiers matching
69 * /[a-z\-]/i are allowed. Values matching colors, sizes, /[a-z\-]/i and safe
70 * urls if allowed and cid urls if mapped are kept.
71 *
72 * Roundcube Changes:
73 * - added $block_elements
74 * - changed $ignore_elements behaviour
75 * - added RFC2397 support
76 * - base URL support
77 * - invalid HTML comments removal before parsing
78 * - "fixing" unitless CSS values for XHTML output
79 * - SVG and MathML support
80 */
81
82 /**
83 * Utility class providing HTML sanitizer
84 *
85 * @package Framework
86 * @subpackage Utils
87 */
88 class rcube_washtml
89 {
/**
 * Allowed HTML elements (default whitelist).
 * The constructor merges it with the optional 'html_elements' option.
 */
public static $html_elements = array('a', 'abbr', 'acronym', 'address', 'area', 'b',
    'basefont', 'bdo', 'big', 'blockquote', 'br', 'caption', 'center',
    'cite', 'code', 'col', 'colgroup', 'dd', 'del', 'dfn', 'dir', 'div', 'dl',
    'dt', 'em', 'fieldset', 'font', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i',
    'ins', 'label', 'legend', 'li', 'map', 'menu', 'nobr', 'ol', 'p', 'pre', 'q',
    's', 'samp', 'small', 'span', 'strike', 'strong', 'sub', 'sup', 'table',
    'tbody', 'td', 'tfoot', 'th', 'thead', 'tr', 'tt', 'u', 'ul', 'var', 'wbr', 'img',
    'video', 'source',
    // form elements
    'button', 'input', 'textarea', 'select', 'option', 'optgroup',
    // SVG
    'svg', 'altglyph', 'altglyphdef', 'altglyphitem', 'animate',
    'animatecolor', 'animatetransform', 'circle', 'clippath', 'defs', 'desc',
    'ellipse', 'font', 'g', 'glyph', 'glyphref', 'hkern', 'image', 'line',
    'lineargradient', 'marker', 'mask', 'mpath', 'path', 'pattern',
    'polygon', 'polyline', 'radialgradient', 'rect', 'set', 'stop', 'switch', 'symbol',
    'text', 'textpath', 'tref', 'tspan', 'use', 'view', 'vkern', 'filter',
    // SVG Filters
    'feblend', 'fecolormatrix', 'fecomponenttransfer', 'fecomposite',
    'feconvolvematrix', 'fediffuselighting', 'fedisplacementmap',
    'feflood', 'fefunca', 'fefuncb', 'fefuncg', 'fefuncr', 'fegaussianblur',
    'feimage', 'femerge', 'femergenode', 'femorphology', 'feoffset',
    'fespecularlighting', 'fetile', 'feturbulence',
    // MathML ('mmultiscripts' was previously misspelled and therefore never matched)
    'math', 'menclose', 'merror', 'mfenced', 'mfrac', 'mglyph', 'mi', 'mlabeledtr',
    'mmultiscripts', 'mn', 'mo', 'mover', 'mpadded', 'mphantom', 'mroot', 'mrow',
    'ms', 'mspace', 'msqrt', 'mstyle', 'msub', 'msup', 'msubsup', 'mtable', 'mtd',
    'mtext', 'mtr', 'munder', 'munderover', 'maligngroup', 'malignmark',
    'mprescripts', 'semantics', 'annotation', 'annotation-xml', 'none',
    'infinity', 'matrix', 'matrixrow', 'ci', 'cn', 'sep', 'apply',
    'plus', 'minus', 'eq', 'power', 'times', 'divide', 'csymbol', 'root',
    'bvar', 'lowlimit', 'uplimit',
);

/**
 * HTML tags that are dropped together with their content (default list)
 */
public static $ignore_elements = array('script', 'applet', 'embed', 'object', 'style');

/**
 * Allowed HTML attributes (default whitelist).
 * The 'style' attribute is not listed here; it is handled separately by wash_attribs().
 */
public static $html_attribs = array('name', 'class', 'title', 'alt', 'width', 'height',
    'align', 'nowrap', 'col', 'row', 'id', 'rowspan', 'colspan', 'cellspacing',
    'cellpadding', 'valign', 'bgcolor', 'color', 'border', 'bordercolorlight',
    'bordercolordark', 'face', 'marginwidth', 'marginheight', 'axis', 'border',
    'abbr', 'char', 'charoff', 'clear', 'compact', 'coords', 'vspace', 'hspace',
    'cellborder', 'size', 'lang', 'dir', 'usemap', 'shape', 'media',
    'background', 'src', 'poster', 'href',
    // attributes of form elements
    'type', 'rows', 'cols', 'disabled', 'readonly', 'checked', 'multiple', 'value',
    // SVG
    'accent-height', 'accumulate', 'additive', 'alignment-baseline', 'alphabetic',
    'ascent', 'attributename', 'attributetype', 'azimuth', 'basefrequency', 'baseprofile',
    'baseline-shift', 'begin', 'bias', 'by', 'clip', 'clip-path', 'clip-rule',
    'color', 'color-interpolation', 'color-interpolation-filters', 'color-profile',
    'color-rendering', 'cx', 'cy', 'd', 'dx', 'dy', 'diffuseconstant', 'direction',
    'display', 'divisor', 'dur', 'edgemode', 'elevation', 'end', 'fill', 'fill-opacity',
    'fill-rule', 'filter', 'flood-color', 'flood-opacity', 'font-family', 'font-size',
    'font-size-adjust', 'font-stretch', 'font-style', 'font-variant', 'font-weight', 'from',
    'fx', 'fy', 'g1', 'g2', 'glyph-name', 'glyphref', 'gradientunits', 'gradienttransform',
    'image-rendering', 'in', 'in2', 'k', 'k1', 'k2', 'k3', 'k4', 'kerning', 'keypoints',
    'keysplines', 'keytimes', 'lengthadjust', 'letter-spacing', 'kernelmatrix',
    'kernelunitlength', 'lighting-color', 'local', 'marker-end', 'marker-mid',
    'marker-start', 'markerheight', 'markerunits', 'markerwidth', 'maskcontentunits',
    'maskunits', 'max', 'mask', 'mode', 'min', 'numoctaves', 'offset', 'operator',
    'opacity', 'order', 'orient', 'orientation', 'origin', 'overflow', 'paint-order',
    'path', 'pathlength', 'patterncontentunits', 'patterntransform', 'patternunits',
    'points', 'preservealpha', 'r', 'rx', 'ry', 'radius', 'refx', 'refy', 'repeatcount',
    'repeatdur', 'restart', 'rotate', 'scale', 'seed', 'shape-rendering', 'show', 'specularconstant',
    'specularexponent', 'spreadmethod', 'stddeviation', 'stitchtiles', 'stop-color',
    'stop-opacity', 'stroke-dasharray', 'stroke-dashoffset', 'stroke-linecap',
    'stroke-linejoin', 'stroke-miterlimit', 'stroke-opacity', 'stroke', 'stroke-width',
    'surfacescale', 'targetx', 'targety', 'transform', 'text-anchor', 'text-decoration',
    'text-rendering', 'textlength', 'to', 'u1', 'u2', 'unicode', 'values', 'viewbox',
    'visibility', 'vert-adv-y', 'version', 'vert-origin-x', 'vert-origin-y', 'word-spacing',
    'wrap', 'writing-mode', 'xchannelselector', 'ychannelselector', 'x', 'x1', 'x2',
    'xmlns', 'y', 'y1', 'y2', 'z', 'zoomandpan',
    // MathML
    'accent', 'accentunder', 'bevelled', 'close', 'columnalign', 'columnlines',
    'columnspan', 'denomalign', 'depth', 'display', 'displaystyle', 'encoding', 'fence',
    'frame', 'largeop', 'length', 'linethickness', 'lspace', 'lquote',
    'mathbackground', 'mathcolor', 'mathsize', 'mathvariant', 'maxsize',
    'minsize', 'movablelimits', 'notation', 'numalign', 'open', 'rowalign',
    'rowlines', 'rowspacing', 'rowspan', 'rspace', 'rquote', 'scriptlevel',
    'scriptminsize', 'scriptsizemultiplier', 'selection', 'separator',
    'separators', 'stretchy', 'subscriptshift', 'supscriptshift', 'symmetric', 'voffset',
    'fontsize', 'fontweight', 'fontstyle', 'fontfamily', 'groupalign', 'edge', 'side',
);

/**
 * Elements which could be empty and be returned in short form (<tag />)
 */
public static $void_elements = array('area', 'base', 'br', 'col', 'command', 'embed', 'hr',
    'img', 'input', 'keygen', 'link', 'meta', 'param', 'source', 'track', 'wbr',
    // MathML
    'sep', 'infinity', 'in', 'plus', 'eq', 'power', 'times', 'divide', 'root',
    'maligngroup', 'none', 'mprescripts',
);

/* State for linked objects in HTML: set to true when a remote URI was blocked */
public $extlinks = false;

/* Current settings */
private $config = array();

/* Registered callback functions for tags (tag name => callable) */
private $handlers = array();

/* Allowed HTML elements (flipped list: tag name => index) */
private $_html_elements = array();

/* HTML tags that are dropped together with their content (flipped list) */
private $_ignore_elements = array();

/* Elements which could be empty and be returned in short form (<tag />) (flipped list) */
private $_void_elements = array();

/* Allowed HTML attributes (flipped list: attribute name => index) */
private $_html_attribs = array();

/* Max nesting level (read from xdebug.max_nesting_level in wash()) */
private $max_nesting_level;

/* Set once the nesting-level error has been logged, so it is reported only once */
private $max_nesting_level_error = false;

/* True when the current input is parsed as XML (SVG without an <html> tag) */
private $is_xml = false;
210
211
/**
 * Class constructor
 *
 * The options 'html_elements', 'html_attribs', 'ignore_elements' and
 * 'void_elements' extend the corresponding static default lists. All other
 * options are stored as configuration on top of the defaults
 * show_washed=true, allow_remote=false, cid_map=array().
 *
 * @param array $p Washer options
 */
public function __construct($p = array())
{
    $p = (array) $p;

    foreach (array('html_elements', 'html_attribs', 'ignore_elements', 'void_elements') as $list) {
        // isset() guard: the option is optional, reading it blindly raises an undefined-index notice
        $custom = isset($p[$list]) ? (array) $p[$list] : array();

        // lists are flipped so that lookups are a cheap isset() on the name
        $this->{'_' . $list} = array_flip($custom) + array_flip(self::$$list);

        unset($p[$list]);
    }

    $this->config = $p + array('show_washed' => true, 'allow_remote' => false, 'cid_map' => array());
}
226
/**
 * Register a handler that renders a given tag instead of the washer.
 *
 * A handler registered later for the same tag replaces the earlier one.
 *
 * @param string   $tagName  Lower-case tag name
 * @param callable $callback Handler invoked by dumpHtml() for that tag
 */
public function add_callback($tagName, $callback)
{
    $registered           = $this->handlers;
    $registered[$tagName] = $callback;

    $this->handlers = $registered;
}
234
/**
 * Filter the content of an inline style attribute.
 *
 * Each "property: value" declaration is split into value tokens; url()
 * tokens are passed through wash_uri(), tokens starting with "behavior"
 * or "expression" are dropped, everything else is kept.
 *
 * @param string $style Raw style attribute value
 *
 * @return string Washed declarations joined with "; "
 */
private function wash_style($style)
{
    $declarations = array();

    // Remove unwanted white-space characters so regular expressions below work better
    $style = preg_replace('/[\n\r\s\t]+/', ' ', $style);

    foreach (explode(';', $style) as $declaration) {
        if (!preg_match('/^\s*([a-z\-]+)\s*:\s*(.*)\s*$/i', $declaration, $match)) {
            continue;
        }

        $cssid = $match[1];
        $value = '';

        foreach ($this->explode_style($match[2]) as $token) {
            if (preg_match('/^url\(/i', $token)) {
                // keep the url() only if its target survives URI washing
                if (preg_match('/^url\(\s*[\'"]?([^\'"\)]*)[\'"]?\s*\)/iu', $token, $url_match)) {
                    $url = $this->wash_uri($url_match[1]);
                    if ($url) {
                        $value .= ' url(' . htmlspecialchars($url, ENT_QUOTES) . ')';
                    }
                }
                continue;
            }

            if (preg_match('/^(behavior|expression)/i', $token)) {
                continue;
            }

            // Set position:fixed to position:absolute for security (#5264)
            if (strcasecmp($cssid, 'position') == 0 && strcasecmp($token, 'fixed') == 0) {
                $token = 'absolute';
            }

            $value .= ' ' . $token;

            // #1488535: Fix size units, so width:800 would be changed to width:800px
            $is_size = preg_match('/^(left|right|top|bottom|width|height)/i', $cssid);
            if ($is_size && preg_match('/^[0-9]+$/', $token)) {
                $value .= 'px';
            }
        }

        if (isset($value[0])) {
            $declarations[] = $cssid . ':' . $value;
        }
    }

    return implode('; ', $declarations);
}
285
/**
 * Take a node and return its allowed attributes with checked values.
 *
 * Attributes that are not whitelisted, or whose value did not pass the
 * checks, are collected and (when the 'show_washed' option is on) listed
 * in an x-washed attribute.
 *
 * @param DOMNode $node Element whose attributes are washed
 *
 * @return string Attribute string, each attribute prefixed with a space
 */
private function wash_attribs($node)
{
    $result = '';
    $washed = array();
    // lower-cased so the tag checks also hold for mixed-case names in XML (SVG) mode
    $tag    = strtolower($node->nodeName);

    foreach ($node->attributes as $name => $attr) {
        $key   = strtolower($name);
        $value = $attr->nodeValue;

        if ($key == 'style' && ($style = $this->wash_style($value))) {
            // replace double quotes to prevent syntax error and XSS issues (#1490227)
            $result .= ' style="' . str_replace('"', '&quot;', $style) . '"';
        }
        else if (isset($this->_html_attribs[$key])) {
            $value = trim($value);
            $out   = null;

            // in SVG to/from attribs may contain anything, including URIs;
            // they are checked as if they were the attribute being animated
            if ($key == 'to' || $key == 'from') {
                $key = strtolower($node->getAttribute('attributeName'));
                if ($key && !isset($this->_html_attribs[$key])) {
                    $key = null;
                }
            }

            if ($this->is_image_attribute($tag, $key)) {
                $out = $this->wash_uri($value, true);
            }
            else if ($this->is_link_attribute($tag, $key)) {
                if (!preg_match('!^(javascript|vbscript|data:text)!i', $value)
                    && preg_match('!^([a-z][a-z0-9.+-]+:|//|#).+!i', $value)
                ) {
                    $out = $value;
                }
            }
            else if ($this->is_funciri_attribute($tag, $key)) {
                // The original code tested the undefined variable $val here, so url()
                // values were never washed and were emitted as-is.
                if (preg_match('/^[a-z:]*url\(/i', $value)) {
                    if (preg_match('/^([a-z:]*url)\(\s*[\'"]?([^\'"\)]*)[\'"]?\s*\)/iu', $value, $match)
                        && ($url = $this->wash_uri($match[2]))
                    ) {
                        // whatever follows the url(...) is still untrusted - escape it
                        $rest = (string) substr($value, strlen($match[0]));

                        $result .= ' ' . $attr->nodeName . '="' . $match[1]
                            . '(' . htmlspecialchars($url, ENT_QUOTES) . ')'
                            . htmlspecialchars($rest, ENT_QUOTES) . '"';
                        continue;
                    }
                    // url() that cannot be parsed or whose target was rejected:
                    // $out stays null so the attribute is reported as washed
                }
                else {
                    $out = $value;
                }
            }
            else if ($key) {
                $out = $value;
            }

            if ($out !== null && $out !== '') {
                $result .= ' ' . $attr->nodeName . '="' . htmlspecialchars($out, ENT_QUOTES) . '"';
            }
            else if ($value) {
                $washed[] = htmlspecialchars($attr->nodeName, ENT_QUOTES);
            }
        }
        else {
            $washed[] = htmlspecialchars($attr->nodeName, ENT_QUOTES);
        }
    }

    if (!empty($washed) && !empty($this->config['show_washed'])) {
        $result .= ' x-washed="' . implode(' ', $washed) . '"';
    }

    return $result;
}
363
/**
 * Wash URI value
 *
 * @param string $uri            URI to check
 * @param bool   $blocked_source When true, a blocked remote URI is replaced
 *                               with the 'blocked_src' option (if configured)
 *
 * @return string|null The URI to use (mapped cid URL, "#id" reference, allowed
 *                     remote URL, blocked-image placeholder or data:image URI),
 *                     null when the URI is rejected
 */
private function wash_uri($uri, $blocked_source = false)
{
    $uri     = (string) $uri;
    $cid_map = isset($this->config['cid_map']) ? (array) $this->config['cid_map'] : array();
    $base    = isset($this->config['base_url']) ? $this->config['base_url'] : '';

    // mapped (e.g. cid:) URLs, looked up as given and relative to the base URL
    foreach (array($uri, $base . $uri) as $candidate) {
        if (!empty($cid_map[$candidate])) {
            return $cid_map[$candidate];
        }
    }

    // allow url(#id) used in SVG; the empty-string check avoids an invalid offset read
    if ($uri !== '' && $uri[0] == '#') {
        return $uri;
    }

    if (preg_match('/^(http|https|ftp):.+/i', $uri)) {
        if (!empty($this->config['allow_remote'])) {
            return $uri;
        }

        // remember that remote content was blocked (FE: show remote images link)
        $this->extlinks = true;

        if ($blocked_source && !empty($this->config['blocked_src'])) {
            return $this->config['blocked_src'];
        }
    }
    else if (preg_match('/^data:image.+/i', $uri)) { // RFC2397
        return $uri;
    }

    return null;
}
394
/**
 * Tell whether the tag/attribute pair is a hyperlink target
 * (href on <a> or <area>).
 *
 * @param string $tag  Tag name
 * @param string $attr Attribute name
 *
 * @return bool
 */
private function is_link_attribute($tag, $attr)
{
    if ($attr != 'href') {
        return false;
    }

    return $tag == 'a' || $tag == 'area';
}
402
/**
 * Tell whether the tag/attribute pair may hold an image (resource) URI.
 *
 * @param string $tag  Tag name
 * @param string $attr Attribute name
 *
 * @return bool
 */
private function is_image_attribute($tag, $attr)
{
    // attributes that qualify on any element ('color-profile' comes from SVG)
    if ($attr == 'background' || $attr == 'color-profile') {
        return true;
    }

    if ($attr == 'poster' && $tag == 'video') {
        return true;
    }

    if ($attr == 'src' && preg_match('/^(img|source|input|video|audio)$/i', $tag)) {
        return true;
    }

    // SVG <image href="...">
    return $tag == 'image' && $attr == 'href';
}
414
/**
 * Tell whether the attribute may hold a FUNCIRI value, i.e. url(...).
 *
 * @param string $tag  Tag name (not used by the check)
 * @param string $attr Attribute name
 *
 * @return bool
 */
private function is_funciri_attribute($tag, $attr)
{
    $funciri_attribs = array(
        'fill', 'filter', 'stroke', 'marker-start',
        'marker-end', 'marker-mid', 'clip-path', 'mask', 'cursor',
    );

    return in_array($attr, $funciri_attribs);
}
423
/**
 * The main loop that recurses on a node tree.
 * It outputs only allowed tags with allowed attributes and allowed inline styles.
 *
 * Elements with a registered handler are rendered by that handler, whitelisted
 * elements are re-serialized, elements from the ignore list are replaced by a
 * comment (content dropped), any other element is replaced by a comment
 * followed by its washed content.
 *
 * @param DOMNode $node  HTML element
 * @param int     $level Recurrence level (safe initial value found empirically)
 *
 * @return string Washed markup of the node's children
 */
private function dumpHtml($node, $level = 20)
{
    if (!$node->hasChildNodes()) {
        return '';
    }

    $level++;

    if ($this->max_nesting_level > 0 && $level == $this->max_nesting_level - 1) {
        // log error message once
        if (empty($this->max_nesting_level_error)) {
            $this->max_nesting_level_error = true;
            rcube::raise_error(array('code' => 500, 'type' => 'php',
                'line' => __LINE__, 'file' => __FILE__,
                'message' => "Maximum nesting level exceeded (xdebug.max_nesting_level={$this->max_nesting_level})"),
                true, false);
        }
        return '<!-- ignored -->';
    }

    $node = $node->firstChild;
    $dump = '';

    do {
        switch ($node->nodeType) {
        case XML_ELEMENT_NODE: // Check element
            $tagName  = strtolower($node->nodeName);
            $callback = isset($this->handlers[$tagName]) ? $this->handlers[$tagName] : null;

            if ($callback) {
                $dump .= call_user_func($callback, $tagName,
                    $this->wash_attribs($node), $this->dumpHtml($node, $level), $this);
            }
            else if (isset($this->_html_elements[$tagName])) {
                $content = $this->dumpHtml($node, $level);
                $dump .= '<' . $node->nodeName;

                if ($tagName == 'svg') {
                    $xpath = new DOMXPath($node->ownerDocument);
                    foreach ($xpath->query('namespace::*') as $ns) {
                        if ($ns->nodeName != 'xmlns:xml') {
                            // the namespace URI comes from the message - escape it
                            $dump .= ' ' . $ns->nodeName . '="'
                                . htmlspecialchars($ns->nodeValue, ENT_QUOTES) . '"';
                        }
                    }
                }
                else if ($tagName == 'textarea' && strpos($content, '<') !== false) {
                    $content = htmlspecialchars($content, ENT_QUOTES);
                }

                $dump .= $this->wash_attribs($node);

                if ($content === '' && ($this->is_xml || isset($this->_void_elements[$tagName]))) {
                    $dump .= ' />';
                }
                else {
                    $dump .= '>' . $content . '</' . $node->nodeName . '>';
                }
            }
            else if (isset($this->_ignore_elements[$tagName])) {
                $dump .= '<!-- ' . htmlspecialchars($node->nodeName, ENT_QUOTES) . ' not allowed -->';
            }
            else {
                $dump .= '<!-- ' . htmlspecialchars($node->nodeName, ENT_QUOTES) . ' ignored -->';
                $dump .= $this->dumpHtml($node, $level); // ignore tags not its content
            }
            break;

        case XML_CDATA_SECTION_NODE:
            // CDATA content is untrusted text; emitting it raw would let markup
            // hidden in <![CDATA[...]]> reach the output unwashed
            $dump .= htmlspecialchars($node->nodeValue);
            break;

        case XML_TEXT_NODE:
            $dump .= htmlspecialchars($node->nodeValue);
            break;

        case XML_HTML_DOCUMENT_NODE:
            $dump .= $this->dumpHtml($node, $level);
            break;
        }
    }
    while ($node = $node->nextSibling);

    return $dump;
}
513
/**
 * Main function: give it untrusted HTML and get washed HTML back.
 *
 * Remote-content policy and the cid: map are taken from the options
 * passed to the constructor. After the call $this->extlinks tells whether
 * remote resources were blocked.
 *
 * @param string $html Untrusted HTML (or SVG) code
 *
 * @return string Washed code, empty string when there is nothing to parse
 */
public function wash($html)
{
    // Charset seems to be ignored (probably if defined in the HTML document)
    $charset = isset($this->config['charset']) ? (string) $this->config['charset'] : '';
    $node    = new DOMDocument('1.0', $charset);

    $this->extlinks = false;

    $html = (string) $this->cleanup($html);

    // Find base URL for images
    if (preg_match('/<base\s+href=[\'"]*([^\'"]+)/is', $html, $matches)) {
        $this->config['base_url'] = $matches[1];
    }
    else {
        $this->config['base_url'] = '';
    }

    // Detect max nesting level (for dumpHTML) (#1489110)
    $this->max_nesting_level = (int) @ini_get('xdebug.max_nesting_level');

    // SVG need to be parsed as XML
    $this->is_xml = stripos($html, '<html') === false && stripos($html, '<svg') !== false;
    $method       = $this->is_xml ? 'loadXML' : 'loadHTML';

    // Nothing to parse. On PHP 8 loadHTML()/loadXML() throw a ValueError for
    // an empty source, which error suppression cannot catch.
    if ($html === '') {
        return '';
    }

    // Collect parser complaints about malformed markup internally instead of
    // emitting warnings; broken HTML is the normal case for e-mail
    $previous = libxml_use_internal_errors(true);

    // Use optimizations if supported
    if (PHP_VERSION_ID >= 50400) {
        $node->{$method}($html, LIBXML_PARSEHUGE | LIBXML_COMPACT | LIBXML_NONET);
    }
    else {
        $node->{$method}($html);
    }

    libxml_clear_errors();
    libxml_use_internal_errors($previous);

    return $this->dumpHtml($node);
}
553
/**
 * Getter for config parameters
 *
 * @param string $prop Option name
 *
 * @return mixed Option value, null when the option is not set
 */
public function get_config($prop)
{
    // isset() guard: unknown options yield null without an undefined-index notice
    return isset($this->config[$prop]) ? $this->config[$prop] : null;
}
561
/**
 * Clean HTML input before it is handed to the DOM parser.
 *
 * @param string $html Raw HTML
 *
 * @return string Pre-processed HTML, empty string when a PCRE error occurred
 */
private function cleanup($html)
{
    $html = trim($html);

    // special replacements (not properly handled by washtml class)
    $html_search = array(
        // space(s) between <NOBR>
        '/(<\/nobr>)(\s+)(<nobr>)/i',
        // PHP bug #32547 workaround: remove title tag
        '/<title[^>]*>[^<]*<\/title>/i',
        // remove <!doctype> before BOM (#1490291)
        '/<\!doctype[^>]+>[^<]*/im',
        // byte-order mark (only outlook?)
        '/^(\0\0\xFE\xFF|\xFF\xFE\0\0|\xFE\xFF|\xFF\xFE|\xEF\xBB\xBF)/',
        // washtml/DOMDocument cannot handle xml namespaces
        '/<html\s[^>]+>/i',
    );

    $html_replace = array(
        '\\1'.' &nbsp; '.'\\3',
        '',
        '',
        '',
        '<html>',
    );

    $html = preg_replace($html_search, $html_replace, $html);

    // PCRE errors handling (#1486856). This has to happen right here:
    // str_replace() below turns null into '' and would hide the failure.
    if ($html === null) {
        return $this->cleanup_pcre_failure();
    }

    // Replace all of those weird MS Word quotes and other high characters
    $badwordchars = array(
        "\xe2\x80\x98", // left single quote
        "\xe2\x80\x99", // right single quote
        "\xe2\x80\x9c", // left double quote
        "\xe2\x80\x9d", // right double quote
        "\xe2\x80\x94", // em dash
        "\xe2\x80\xa6"  // ellipsis
    );

    $fixedwordchars = array(
        "'",
        "'",
        '"',
        '"',
        '&mdash;',
        '...'
    );

    $html = str_replace($badwordchars, $fixedwordchars, $html);

    // fix (unknown/malformed) HTML tags before "wash"
    $html = preg_replace_callback('/(<(?!\!)[\/]*)([^\s>]+)([^>]*)/', array($this, 'html_tag_callback'), $html);

    if ($html === null) {
        return $this->cleanup_pcre_failure();
    }

    // Remove invalid HTML comments (#1487759)
    // Don't remove valid conditional comments
    // Don't remove MSOutlook (<!-->) conditional comments (#1489004)
    $html = preg_replace('/<!--[^-<>\[\n]+>/', '', $html);

    if ($html === null) {
        return $this->cleanup_pcre_failure();
    }

    // fix broken nested lists
    self::fix_broken_lists($html);

    // turn relative into absolute urls
    $html = self::resolve_base($html);

    return $html;
}

/**
 * Log a PCRE failure that happened during cleanup().
 *
 * @return string Always an empty string, the value cleanup() returns on failure
 */
private function cleanup_pcre_failure()
{
    $preg_error = preg_last_error();
    $errstr     = "Could not clean up HTML message! PCRE Error: $preg_error.";

    if ($preg_error == PREG_BACKTRACK_LIMIT_ERROR) {
        $errstr .= " Consider raising pcre.backtrack_limit!";
    }
    if ($preg_error == PREG_RECURSION_LIMIT_ERROR) {
        $errstr .= " Consider raising pcre.recursion_limit!";
    }

    rcube::raise_error(array('code' => 620, 'type' => 'php',
        'line' => __LINE__, 'file' => __FILE__,
        'message' => $errstr), true, false);

    return '';
}
648
/**
 * preg_replace_callback() handler that repairs a single HTML tag.
 *
 * @param array $matches Regexp matches: [1] tag opening ("<" plus optional
 *                       slashes), [2] tag name, [3] rest of the tag
 *
 * @return string Repaired tag (without the closing ">")
 */
public static function html_tag_callback($matches)
{
    $opening = $matches[1];

    // Microsoft's Smart Tags <st1:xxxx>: keep only the part before the colon
    $tagname = preg_replace('/:.*$/', '', $matches[2]);
    // strip forbidden characters
    $tagname = preg_replace('/[^a-z0-9_\[\]\!?-]/i', '', $tagname);

    // fix invalid closing tags - remove any attributes (#1489446)
    $rest = $opening == '</' ? '' : $matches[3];

    return $opening . $tagname . $rest;
}
667
/**
 * Convert all relative URLs according to a <base> tag found in the HTML.
 *
 * @param string $body HTML code
 *
 * @return string HTML code, unchanged when no usable <base href> is present
 */
public static function resolve_base($body)
{
    // check for <base href=...>
    $has_base = preg_match('!(<base.*href=["\']?)([hftps]{3,5}://[a-z0-9/.%-]+)!i', $body, $regs);

    if (!$has_base) {
        return $body;
    }

    $replacer = new rcube_base_replacer($regs[2]);

    return $replacer->replace($body);
}
681
/**
 * Fix broken nested lists, they are not handled properly by DOMDocument (#1488768)
 *
 * A nested <ol>/<ul> that directly follows a closed <li> (instead of being
 * inside it) is moved to the end of that preceding <li>.
 *
 * @param string &$html HTML code, modified in place
 */
public static function fix_broken_lists(&$html)
{
    $delimiters = array(' ', '>');

    // do two rounds, one for <ol>, one for <ul>
    foreach (array('ol', 'ul') as $tag) {
        $pos = 0;
        while (($pos = stripos($html, '<' . $tag, $pos)) !== false) {
            $pos++;

            // make sure this is an ol/ul tag; isset() guards against a match
            // at the very end of the string
            $next = isset($html[$pos+2]) ? $html[$pos+2] : '';
            if (!in_array($next, $delimiters, true)) {
                continue;
            }

            $p      = $pos;
            $in_li  = false;
            $li_pos = 0;

            while (($p = strpos($html, '<', $p)) !== false) {
                $tt    = strtolower(substr($html, $p, 4));
                $after = isset($html[$p+4]) ? $html[$p+4] : '';

                // li open tag
                if ($tt == '<li>' || $tt == '<li ') {
                    $in_li = true;
                    $p += 4;
                }
                // li close tag
                else if ($tt == '</li' && in_array($after, $delimiters, true)) {
                    $li_pos = $p;
                    $p += 4;
                    $in_li = false;
                }
                // ul/ol closing tag
                else if ($tt == '</' . $tag && in_array($after, $delimiters, true)) {
                    break;
                }
                // nested ol/ul element out of li
                else if (!$in_li && $li_pos && ($tt == '<ol>' || $tt == '<ol ' || $tt == '<ul>' || $tt == '<ul ')) {
                    // find closing tag of this ul/ol element
                    $element = substr($tt, 1, 2);
                    $cpos    = $p;
                    do {
                        $tpos = stripos($html, '<' . $element, $cpos+1);
                        $cpos = stripos($html, '</' . $element, $cpos+1);
                    }
                    while ($tpos !== false && $cpos !== false && $cpos > $tpos);

                    // not found, this is invalid HTML, skip it
                    if ($cpos === false) {
                        break;
                    }

                    // get element content
                    $end = strpos($html, '>', $cpos);

                    // unterminated closing tag: a length computed from false
                    // would be negative and mangle the markup, so skip it
                    if ($end === false) {
                        break;
                    }

                    $len     = $end - $p + 1;
                    $element = substr($html, $p, $len);

                    // move element to the end of the last li
                    $html = substr_replace($html, '', $p, $len);
                    $html = substr_replace($html, $element, $li_pos, 0);

                    $p = $end;
                }
                else {
                    $p++;
                }
            }
        }
    }
}
754
/**
 * Explode css style value
 *
 * CSS comments are removed first; the value is then split on spaces that
 * are outside of quotes and do not directly follow a comma or an opening
 * parenthesis.
 *
 * @param string $style Value part of a CSS declaration
 *
 * @return array List of value tokens (a single empty string for empty input)
 */
protected function explode_style($style)
{
    $style = (string) $style;
    $pos   = 0;

    // first remove comments
    while (($pos = strpos($style, '/*', $pos)) !== false) {
        $end = strpos($style, '*/', $pos+2);

        if ($end === false) {
            // unterminated comment: drop everything from its start
            $style = (string) substr($style, 0, $pos);
        }
        else {
            $style = substr_replace($style, '', $pos, $end - $pos + 2);
        }
    }

    $style  = trim($style);
    $strlen = strlen($style);
    $result = array();
    $q      = false; // currently open quote character, false when outside quotes

    // explode value
    for ($p = $i = 0; $i < $strlen; $i++) {
        $char = $style[$i];
        // At $i == 0 there is no previous character. Reading offset -1 would
        // return the LAST character on PHP >= 7.1 and make a leading quote
        // look escaped when the value ends with a backslash.
        $prev = $i > 0 ? $style[$i-1] : '';

        if (($char == "\"" || $char == "'") && $prev != "\\") {
            if ($q === $char) {
                $q = false;
            }
            else if (!$q) {
                $q = $char;
            }
        }

        if (!$q && $char == ' ' && $prev !== ',' && $prev !== '(') {
            $result[] = substr($style, $p, $i - $p);
            $p = $i + 1;
        }
    }

    $result[] = (string) substr($style, $p);

    return $result;
}
799 }