0
|
1 <?php
|
|
2
|
|
3 /**
|
|
4 +-----------------------------------------------------------------------+
|
|
5 | This file is part of the Roundcube Webmail client |
|
|
6 | Copyright (C) 2008-2012, The Roundcube Dev Team |
|
|
7 | Copyright (c) 2005-2007, Jon Abernathy <jon@chuggnutt.com> |
|
|
8 | |
|
|
9 | Licensed under the GNU General Public License version 3 or |
|
|
10 | any later version with exceptions for skins & plugins. |
|
|
11 | See the README file for a full license statement. |
|
|
12 | |
|
|
13 | PURPOSE: |
|
|
14 | Converts HTML to formatted plain text (based on html2text class) |
|
|
15 +-----------------------------------------------------------------------+
|
|
16 | Author: Thomas Bruederli <roundcube@gmail.com> |
|
|
17 | Author: Aleksander Machniak <alec@alec.pl> |
|
|
18 | Author: Jon Abernathy <jon@chuggnutt.com> |
|
|
19 +-----------------------------------------------------------------------+
|
|
20 */
|
|
21
|
|
22 /**
|
|
23 * Takes HTML and converts it to formatted, plain text.
|
|
24 *
|
|
25 * Thanks to Alexander Krug (http://www.krugar.de/) for pointing out and
|
|
26 * correcting an error in the regexp search array. Fixed 7/30/03.
|
|
27 *
|
|
28 * Updated set_html() function's file reading mechanism, 9/25/03.
|
|
29 *
|
|
30 * Thanks to Joss Sanglier (http://www.dancingbear.co.uk/) for adding
|
|
31 * several more HTML entity codes to the $search and $replace arrays.
|
|
32 * Updated 11/7/03.
|
|
33 *
|
|
34 * Thanks to Darius Kasperavicius (http://www.dar.dar.lt/) for
|
|
35 * suggesting the addition of $allowed_tags and its supporting function
|
|
36 * (which I slightly modified). Updated 3/12/04.
|
|
37 *
|
|
38 * Thanks to Justin Dearing for pointing out that a replacement for the
|
|
39 * <TH> tag was missing, and suggesting an appropriate fix.
|
|
40 * Updated 8/25/04.
|
|
41 *
|
|
42 * Thanks to Mathieu Collas (http://www.myefarm.com/) for finding a
|
|
43 * display/formatting bug in the _build_link_list() function: email
|
|
44 * readers would show the left bracket and number ("[1") as part of the
|
|
45 * rendered email address.
|
|
46 * Updated 12/16/04.
|
|
47 *
|
|
48 * Thanks to Wojciech Bajon (http://histeria.pl/) for submitting code
|
|
49 * to handle relative links, which I hadn't considered. I modified his
|
|
50 * code a bit to handle normal HTTP links and MAILTO links. Also for
|
|
51 * suggesting three additional HTML entity codes to search for.
|
|
52 * Updated 03/02/05.
|
|
53 *
|
|
54 * Thanks to Jacob Chandler for pointing out another link condition
|
|
55 * for the _build_link_list() function: "https".
|
|
56 * Updated 04/06/05.
|
|
57 *
|
|
58 * Thanks to Marc Bertrand (http://www.dresdensky.com/) for
|
|
59 * suggesting a revision to the word wrapping functionality; if you
|
|
60 * specify a $width of 0 or less, word wrapping will be ignored.
|
|
61 * Updated 11/02/06.
|
|
62 *
|
|
63 * *** Big housecleaning updates below:
|
|
64 *
|
|
65 * Thanks to Colin Brown (http://www.sparkdriver.co.uk/) for
|
|
66 * suggesting the fix to handle </li> and blank lines (whitespace).
|
|
67 * Christian Basedau (http://www.movetheweb.de/) also suggested the
|
|
68 * blank lines fix.
|
|
69 *
|
|
70 * Special thanks to Marcus Bointon (http://www.synchromedia.co.uk/),
|
|
71 * Christian Basedau, Norbert Laposa (http://ln5.co.uk/),
|
|
72 * Bas van de Weijer, and Marijn van Butselaar
|
|
73 * for pointing out my glaring error in the <th> handling. Marcus also
|
|
74 * supplied a host of fixes.
|
|
75 *
|
|
76 * Thanks to Jeffrey Silverman (http://www.newtnotes.com/) for pointing
|
|
77 * out that extra spaces should be compressed--a problem addressed with
|
|
78 * Marcus Bointon's fixes but that I had not yet incorporated.
|
|
79 *
|
|
80 * Thanks to Daniel Schledermann (http://www.typoconsult.dk/) for
|
|
81 * suggesting a valuable fix with <a> tag handling.
|
|
82 *
|
|
83 * Thanks to Wojciech Bajon (again!) for suggesting fixes and additions,
|
|
84 * including the <a> tag handling that Daniel Schledermann pointed
|
|
85 * out but that I had not yet incorporated. I haven't (yet)
|
|
86 * incorporated all of Wojciech's changes, though I may at some
|
|
87 * future time.
|
|
88 *
|
|
89 * *** End of the housecleaning updates. Updated 08/08/07.
|
|
90 */
|
|
91
|
|
/**
 * Converts HTML to formatted plain text
 *
 * Typical use: pass the HTML to the constructor and call get_text().
 * Conversion is lazy; it runs on the first get_text() call after the
 * source was set and the result is cached until set_html() is called again.
 *
 * @package    Framework
 * @subpackage Utils
 */
class rcube_html2text
{
    /**
     * Contains the HTML content to convert.
     *
     * @var string $html
     */
    protected $html;

    /**
     * Contains the converted, formatted text.
     *
     * @var string $text
     */
    protected $text;

    /**
     * Maximum width of the formatted text, in columns.
     *
     * Set this value to 0 (or less) to ignore word wrapping
     * and not constrain text to a fixed-width column.
     *
     * Note: the constructor always overwrites this value with its own
     * $width argument (which defaults to 75).
     *
     * @var integer $width
     */
    protected $width = 70;

    /**
     * Target character encoding for output text
     *
     * @var string $charset
     */
    protected $charset = 'UTF-8';

    /**
     * List of preg* regular expression patterns to search for,
     * used in conjunction with $replace.
     *
     * Entries are applied in order and must stay index-aligned with $replace.
     * Short tag names are followed by \b so that e.g. the <i> rule does not
     * fire on <img> and the <li> rule does not fire on <link>.
     *
     * @var array $search
     * @see $replace
     */
    protected $search = array(
        '/\r/',                                  // Non-legal carriage return
        '/^.*<body[^>]*>\n*/is',                 // Anything before <body>
        '/<head[^>]*>.*?<\/head>/is',            // <head>
        '/<script[^>]*>.*?<\/script>/is',        // <script>
        '/<style[^>]*>.*?<\/style>/is',          // <style>
        '/[\n\t]+/',                             // Newlines and tabs
        '/<p\b[^>]*>/i',                         // <p>
        '/<\/p>[\s\n\t]*<div[^>]*>/i',           // </p> before <div>
        '/<br[^>]*>[\s\n\t]*<div[^>]*>/i',       // <br> before <div>
        '/<br[^>]*>\s*/i',                       // <br>
        '/<i\b[^>]*>(.*?)<\/i>/i',               // <i>
        '/<em\b[^>]*>(.*?)<\/em>/i',             // <em>
        '/(<ul[^>]*>|<\/ul>)/i',                 // <ul> and </ul>
        '/(<ol[^>]*>|<\/ol>)/i',                 // <ol> and </ol>
        '/<li\b[^>]*>(.*?)<\/li>/i',             // <li> and </li>
        '/<li\b[^>]*>/i',                        // <li>
        '/<hr[^>]*>/i',                          // <hr>
        '/<div[^>]*>/i',                         // <div>
        '/(<table[^>]*>|<\/table>)/i',           // <table> and </table>
        '/(<tr[^>]*>|<\/tr>)/i',                 // <tr> and </tr>
        '/<td[^>]*>(.*?)<\/td>/i',               // <td> and </td>
    );

    /**
     * List of pattern replacements corresponding to patterns searched.
     *
     * @var array $replace
     * @see $search
     */
    protected $replace = array(
        '',                                      // Non-legal carriage return
        '',                                      // Anything before <body>
        '',                                      // <head>
        '',                                      // <script>
        '',                                      // <style>
        ' ',                                     // Newlines and tabs
        "\n\n",                                  // <p>
        "\n<div>",                               // </p> before <div>
        '<div>',                                 // <br> before <div>
        "\n",                                    // <br>
        '_\\1_',                                 // <i>
        '_\\1_',                                 // <em>
        "\n\n",                                  // <ul> and </ul>
        "\n\n",                                  // <ol> and </ol>
        "\t* \\1\n",                             // <li> and </li>
        "\n\t* ",                                // <li>
        "\n-------------------------\n",         // <hr>
        "<div>\n",                               // <div>
        "\n\n",                                  // <table> and </table>
        "\n",                                    // <tr> and </tr>
        "\t\t\\1\n",                             // <td> and </td>
    );

    /**
     * List of preg* regular expression patterns to search for,
     * used in conjunction with $ent_replace.
     *
     * Applied after strip_tags(), so the decoded "<" and ">" characters
     * cannot be mistaken for markup.
     *
     * @var array $ent_search
     * @see $ent_replace
     */
    protected $ent_search = array(
        '/&(nbsp|#160);/i',                      // Non-breaking space
        '/&(quot|rdquo|ldquo|#8220|#8221|#147|#148);/i',
                                                 // Double quotes
        '/&(apos|rsquo|lsquo|#8216|#8217);/i',   // Single quotes
        '/&gt;/i',                               // Greater-than
        '/&lt;/i',                               // Less-than
        '/&(copy|#169);/i',                      // Copyright
        '/&(trade|#8482|#153);/i',               // Trademark
        '/&(reg|#174);/i',                       // Registered
        '/&(mdash|#151|#8212);/i',               // mdash
        '/&(ndash|minus|#8211|#8722);/i',        // ndash
        '/&(bull|#149|#8226);/i',                // Bullet
        '/&(pound|#163);/i',                     // Pound sign
        '/&(euro|#8364);/i',                     // Euro sign
        '/&(amp|#38);/i',                        // Ampersand: see _converter()
        '/[ ]{2,}/',                             // Runs of spaces, post-handling
    );

    /**
     * List of pattern replacements corresponding to patterns searched.
     *
     * @var array $ent_replace
     * @see $ent_search
     */
    protected $ent_replace = array(
        "\xC2\xA0",                              // Non-breaking space (UTF-8 bytes, see _converter())
        '"',                                     // Double quotes
        "'",                                     // Single quotes
        '>',                                     // Greater-than
        '<',                                     // Less-than
        '(c)',                                   // Copyright
        '(tm)',                                  // Trademark
        '(R)',                                   // Registered
        '--',                                    // mdash
        '-',                                     // ndash
        '*',                                     // Bullet
        '£',                                     // Pound sign
        'EUR',                                   // Euro sign. €
        '|+|amp|+|',                             // Ampersand: see _converter()
        ' ',                                     // Runs of spaces, post-handling
    );

    /**
     * List of preg* regular expression patterns to search for
     * and replace using callback function.
     *
     * The first capture group of every pattern is the tag name that
     * tags_preg_callback() dispatches on.
     *
     * @var array $callback_search
     * @see tags_preg_callback()
     */
    protected $callback_search = array(
        '/<(a) [^>]*href=("|\')([^"\']+)\2[^>]*>(.*?)<\/a>/i', // <a href="">
        '/<(h)[123456]( [^>]*)?>(.*?)<\/h[123456]>/i',         // h1 - h6
        '/<(b)( [^>]*)?>(.*?)<\/b>/i',                         // <b>
        '/<(strong)( [^>]*)?>(.*?)<\/strong>/i',               // <strong>
        '/<(th)( [^>]*)?>(.*?)<\/th>/i',                       // <th> and </th>
    );

    /**
     * List of preg* regular expression patterns to search for in PRE body,
     * used in conjunction with $pre_replace.
     *
     * @var array $pre_search
     * @see $pre_replace
     */
    protected $pre_search = array(
        "/\n/",
        "/\t/",
        '/ /',
        '/<pre[^>]*>/',
        '/<\/pre>/'
    );

    /**
     * List of pattern replacements corresponding to patterns searched for PRE body.
     *
     * Whitespace is replaced with &nbsp; entities so that it survives the
     * "runs of spaces" rule in $ent_search; _converter() turns the
     * non-breaking spaces back into regular spaces at the very end.
     *
     * @var array $pre_replace
     * @see $pre_search
     */
    protected $pre_replace = array(
        '<br>',
        '&nbsp;&nbsp;&nbsp;&nbsp;',
        '&nbsp;',
        '',
        ''
    );

    /**
     * Temporary buffer holding the converted body of the PRE element
     * currently processed by _convert_pre().
     *
     * @var string $pre_content
     * @see pre_preg_callback()
     */
    protected $pre_content = '';

    /**
     * Contains a list of HTML tags to allow in the resulting text.
     *
     * @var string $allowed_tags
     * @see set_allowed_tags()
     */
    protected $allowed_tags = '';

    /**
     * Contains the base URL that relative links should resolve to.
     *
     * @var string $url
     */
    protected $url;

    /**
     * Indicates whether content in the $html variable has been converted yet.
     *
     * @var boolean $_converted
     * @see $html, $text
     */
    protected $_converted = false;

    /**
     * Contains URL addresses from links to be rendered in plain text.
     *
     * @var array $_link_list
     * @see _build_link_list()
     */
    protected $_link_list = array();

    /**
     * Boolean flag, true if a table of link URLs should be listed after the text.
     *
     * @var boolean $_do_links
     * @see __construct()
     */
    protected $_do_links = true;

    /**
     * Constructor.
     *
     * If the HTML source string (or file) is supplied, the class
     * will instantiate with that source propagated, all that has
     * to be done is to call get_text().
     *
     * @param string  $source    HTML content
     * @param boolean $from_file Indicates $source is a file to pull content from
     * @param boolean $do_links  Indicate whether a table of link URLs is desired
     * @param integer $width     Maximum width of the formatted text, 0 for no limit
     * @param string  $charset   Target character encoding of the output text
     */
    public function __construct($source = '', $from_file = false, $do_links = true, $width = 75, $charset = 'UTF-8')
    {
        if (!empty($source)) {
            $this->set_html($source, $from_file);
        }

        $this->set_base_url();

        $this->_do_links = $do_links;
        $this->width     = $width;
        $this->charset   = $charset;
    }

    /**
     * Loads source HTML into memory, either from $source string or a file.
     *
     * When $from_file is set but the file does not exist, $source itself
     * is used as the HTML content. Any previously converted text is invalidated.
     *
     * @param string  $source    HTML content
     * @param boolean $from_file Indicates $source is a file to pull content from
     */
    public function set_html($source, $from_file = false)
    {
        if ($from_file && file_exists($source)) {
            $this->html = file_get_contents($source);
        }
        else {
            $this->html = $source;
        }

        $this->_converted = false;
    }

    /**
     * Returns the text, converted from HTML.
     *
     * @return string Plain text
     */
    public function get_text()
    {
        if (!$this->_converted) {
            $this->_convert();
        }

        return $this->text;
    }

    /**
     * Prints the text, converted from HTML.
     */
    public function print_text()
    {
        print $this->get_text();
    }

    /**
     * Sets the allowed HTML tags to pass through to the resulting text.
     *
     * Tags should be in the form "<p>", with no corresponding closing tag.
     * An empty argument is ignored, i.e. the current list is kept.
     *
     * @param string $allowed_tags List of tags in strip_tags() format
     */
    public function set_allowed_tags($allowed_tags = '')
    {
        if (!empty($allowed_tags)) {
            $this->allowed_tags = $allowed_tags;
        }
    }

    /**
     * Sets a base URL to handle relative links.
     *
     * With an empty argument the base is derived from $_SERVER['HTTP_HOST']
     * (using the http scheme), or set to an empty string when no host is known.
     *
     * @param string $url Base URL
     */
    public function set_base_url($url = '')
    {
        if (empty($url)) {
            if (!empty($_SERVER['HTTP_HOST'])) {
                $this->url = 'http://' . $_SERVER['HTTP_HOST'];
            }
            else {
                $this->url = '';
            }
        }
        else {
            // Strip any trailing slashes for consistency (relative
            // URLs may already start with a slash like "/file.html")
            if (substr($url, -1) == '/') {
                $url = substr($url, 0, -1);
            }
            $this->url = $url;
        }
    }

    /**
     * Workhorse function that does actual conversion (calls _converter() method).
     *
     * Stores the result in $text, appends the list of collected links
     * (if any) and marks the content as converted.
     */
    protected function _convert()
    {
        // Variables used for building the link list
        $this->_link_list = array();

        // $html is null until set_html() is called; treat that as an empty document
        $text = (string) $this->html;

        // Convert HTML to TXT
        $this->_converter($text);

        // Add link list
        if (!empty($this->_link_list)) {
            $text .= "\n\nLinks:\n------\n";
            foreach ($this->_link_list as $idx => $url) {
                $text .= '[' . ($idx+1) . '] ' . $url . "\n";
            }
        }

        $this->text       = $text;
        $this->_converted = true;
    }

    /**
     * Workhorse function that does actual conversion.
     *
     * First performs custom tag replacement specified by $search and
     * $replace arrays. Then strips any remaining HTML tags, reduces whitespace
     * and newlines to a readable format, and word wraps the text to
     * $width characters.
     *
     * Called recursively by _convert_blockquotes() for blockquote bodies.
     *
     * @param string &$text Reference to HTML content string
     */
    protected function _converter(&$text)
    {
        // Convert <BLOCKQUOTE> (before PRE!)
        $this->_convert_blockquotes($text);

        // Convert <PRE>
        $this->_convert_pre($text);

        // Run our defined tags search-and-replace
        $text = preg_replace($this->search, $this->replace, $text);

        // Run our defined tags search-and-replace with callback
        $text = preg_replace_callback($this->callback_search, array($this, 'tags_preg_callback'), $text);

        // Strip any other HTML tags
        $text = strip_tags($text, $this->allowed_tags);

        // Run our defined entities/characters search-and-replace
        $text = preg_replace($this->ent_search, $this->ent_replace, $text);

        // Replace known html entities
        $text = html_entity_decode($text, ENT_QUOTES, $this->charset);

        // Replace unicode nbsp to regular spaces (after the runs-of-spaces
        // rule above, so that preserved whitespace is not collapsed)
        $text = preg_replace('/\xC2\xA0/', ' ', $text);

        // Remove unknown/unhandled entities (this cannot be done in search-and-replace block)
        $text = preg_replace('/&([a-zA-Z0-9]{2,6}|#[0-9]{2,4});/', '', $text);

        // Convert "|+|amp|+|" into "&", need to be done after handling of unknown entities
        // This properly handles situation of "&amp;quot;" in input string
        $text = str_replace('|+|amp|+|', '&', $text);

        // Bring down number of empty lines to 2 max
        $text = preg_replace("/\n\s+\n/", "\n\n", $text);
        $text = preg_replace("/[\n]{3,}/", "\n\n", $text);

        // remove leading empty lines (can be produced by eg. P tag on the beginning)
        $text = ltrim($text, "\n");

        // Wrap the text to a readable format.
        // If width is 0 or less, don't wrap the text.
        if ($this->width > 0) {
            $text = wordwrap($text, $this->width);
        }
    }

    /**
     * Helper function called by preg_replace() on link replacement.
     *
     * Maintains an internal list of links to be displayed at the end of the
     * text, with numeric indices to the original point in the text they
     * appeared. Also makes an effort at identifying and handling absolute
     * and relative links.
     *
     * @param string $link    URL of the link
     * @param string $display Part of the text to associate number with
     *
     * @return string Link text, followed by " [N]" when the link was listed
     */
    protected function _build_link_list($link, $display)
    {
        if (!$this->_do_links || empty($link)) {
            return $display;
        }

        // Ignored link types
        if (preg_match('!^(javascript:|mailto:|#)!i', $link)) {
            return $display;
        }

        // skip links with href == content (#1490434)
        if ($link === $display) {
            return $display;
        }

        if (preg_match('!^([a-z][a-z0-9.+-]+:)!i', $link)) {
            // link with a scheme, use as is
            $url = $link;
        }
        else {
            // relative link, prepend the base URL
            $url = $this->url;
            if (substr($link, 0, 1) != '/') {
                $url .= '/';
            }
            $url .= $link;
        }

        // every distinct URL is listed only once
        if (($index = array_search($url, $this->_link_list, true)) === false) {
            $index = count($this->_link_list);
            $this->_link_list[] = $url;
        }

        return $display . ' [' . ($index+1) . ']';
    }

    /**
     * Helper function for PRE body conversion.
     *
     * Replaces every PRE element with a DIV in which line breaks and
     * whitespace are encoded as <br> and &nbsp;, so the generic rules
     * in _converter() do not collapse them.
     *
     * @param string &$text HTML content
     */
    protected function _convert_pre(&$text)
    {
        // get the content of PRE element
        while (preg_match('/<pre[^>]*>(.*)<\/pre>/ismU', $text, $matches)) {
            $this->pre_content = $matches[1];

            // Run our defined tags search-and-replace with callback
            $this->pre_content = preg_replace_callback($this->callback_search,
                array($this, 'tags_preg_callback'), $this->pre_content);

            // convert the content
            $this->pre_content = sprintf('<div><br>%s<br></div>',
                preg_replace($this->pre_search, $this->pre_replace, $this->pre_content));

            // replace the content (use callback because content can contain $0 variable)
            $text = preg_replace_callback('/<pre[^>]*>.*<\/pre>/ismU',
                array($this, 'pre_preg_callback'), $text, 1);

            // free memory
            $this->pre_content = '';
        }
    }

    /**
     * Helper function for BLOCKQUOTE body conversion.
     *
     * Finds every top-level BLOCKQUOTE element (with its matching end tag),
     * converts its body recursively with _converter(), prefixes each line
     * with a ">" citation marker and puts the result into a PRE element.
     *
     * @param string &$text HTML content
     */
    protected function _convert_blockquotes(&$text)
    {
        $offset = 0;

        while (($start = stripos($text, '<blockquote', $offset)) !== false) {
            $offset = $start + 12;
            // nesting depth relative to the blockquote found at $start
            $level  = 0;

            do {
                $end  = stripos($text, '</blockquote>', $offset);
                $next = stripos($text, '<blockquote', $offset);

                // nested <blockquote>, go one level deeper and keep scanning
                if ($end !== false && $next !== false && $next < $end) {
                    $offset = $next + 12;
                    $level++;
                }
                // </blockquote> tag of a nested blockquote
                else if ($end !== false && $level > 0) {
                    $offset = $end + 12;
                    $level--;
                }
                // found matching end tag
                else if ($end !== false) {
                    $taglen   = strpos($text, '>', $start) - $start;
                    $startpos = $start + $taglen + 1;

                    // get blockquote content
                    $body = trim(substr($text, $startpos, $end - $startpos));

                    // adjust text wrapping width (make room for the "> " prefix)
                    $p_width = $this->width;
                    if ($this->width > 0) {
                        $this->width -= 2;
                    }

                    // replace content with inner blockquotes
                    $this->_converter($body);

                    // restore text width
                    $this->width = $p_width;

                    // Add citation markers and create <pre> block
                    $body = preg_replace_callback('/((?:^|\n)>*)([^\n]*)/',
                        array($this, 'blockquote_citation_callback'), trim($body));

                    // use the configured charset, with the default one the
                    // function may return an empty string on invalid input
                    $body = '<pre>' . htmlspecialchars($body, ENT_COMPAT, $this->charset) . '</pre>';

                    // 13 == strlen('</blockquote>')
                    $text   = substr_replace($text, $body . "\n", $start, $end + 13 - $start);
                    $offset = 0;

                    break;
                }
                // abort on invalid tag structure (e.g. no closing tag found)
                else {
                    break;
                }
            }
            while ($end || $next);
        }
    }

    /**
     * Callback function to correctly add citation markers for blockquote contents
     *
     * @param array $m PREG matches: [1] line start with existing markers, [2] line content
     *
     * @return string Line with one more ">" marker prepended
     */
    public function blockquote_citation_callback($m)
    {
        $line = ltrim($m[2]);

        // no space between markers of nested quotes (">>"); the line can be empty
        $space = ($line !== '' && $line[0] == '>') ? '' : ' ';

        return $m[1] . '>' . $space . $line;
    }

    /**
     * Callback function for preg_replace_callback use.
     *
     * Dispatches on the tag name captured by $callback_search patterns.
     *
     * @param array $matches PREG matches
     *
     * @return string Replacement text
     */
    public function tags_preg_callback($matches)
    {
        switch (strtolower($matches[1])) {
        case 'b':
        case 'strong':
            return $this->_toupper($matches[3]);
        case 'th':
            return $this->_toupper("\t\t". $matches[3] ."\n");
        case 'h':
            return $this->_toupper("\n\n". $matches[3] ."\n\n");
        case 'a':
            // Remove spaces in URL (#1487805)
            $url = str_replace(' ', '', $matches[3]);
            return $this->_build_link_list($url, $matches[4]);
        }

        // unknown tag: leave the matched text untouched
        return $matches[0];
    }

    /**
     * Callback function for preg_replace_callback use in PRE content handler.
     *
     * @param array $matches PREG matches (not used)
     *
     * @return string Converted PRE content prepared by _convert_pre()
     */
    public function pre_preg_callback($matches)
    {
        return $this->pre_content;
    }

    /**
     * Strtoupper function with HTML tags and entities handling.
     *
     * @param string $str Text to convert
     *
     * @return string Converted text
     */
    private function _toupper($str)
    {
        // string can contain HTML tags (-1 means no limit)
        $chunks = preg_split('/(<[^>]*>)/', $str, -1, PREG_SPLIT_NO_EMPTY | PREG_SPLIT_DELIM_CAPTURE);

        // convert toupper only the text between HTML tags
        foreach ($chunks as $idx => $chunk) {
            if ($chunk[0] != '<') {
                $chunks[$idx] = $this->_strtoupper($chunk);
            }
        }

        return implode($chunks);
    }

    /**
     * Strtoupper multibyte wrapper function with HTML entities handling.
     *
     * Entities are decoded first, so that e.g. "&eacute;" is upper-cased
     * as a character and the entity name itself is left alone.
     *
     * @param string $str Text to convert
     *
     * @return string Converted text
     */
    private function _strtoupper($str)
    {
        $str = html_entity_decode($str, ENT_COMPAT, $this->charset);
        $str = mb_strtoupper($str);
        $str = htmlspecialchars($str, ENT_COMPAT, $this->charset);

        return $str;
    }
}
|