Mercurial > hg > rc2
comparison program/lib/Roundcube/rcube_html2text.php @ 0:4681f974d28b
vanilla 1.3.3 distro, I hope
author | Charlie Root |
---|---|
date | Thu, 04 Jan 2018 15:52:31 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:4681f974d28b |
---|---|
1 <?php | |
2 | |
3 /** | |
4 +-----------------------------------------------------------------------+ | |
5 | This file is part of the Roundcube Webmail client | | |
6 | Copyright (C) 2008-2012, The Roundcube Dev Team | | |
7 | Copyright (c) 2005-2007, Jon Abernathy <jon@chuggnutt.com> | | |
8 | | | |
9 | Licensed under the GNU General Public License version 3 or | | |
10 | any later version with exceptions for skins & plugins. | | |
11 | See the README file for a full license statement. | | |
12 | | | |
13 | PURPOSE: | | |
14 | Converts HTML to formatted plain text (based on html2text class) | | |
15 +-----------------------------------------------------------------------+ | |
16 | Author: Thomas Bruederli <roundcube@gmail.com> | | |
17 | Author: Aleksander Machniak <alec@alec.pl> | | |
18 | Author: Jon Abernathy <jon@chuggnutt.com> | | |
19 +-----------------------------------------------------------------------+ | |
20 */ | |
21 | |
22 /** | |
23 * Takes HTML and converts it to formatted, plain text. | |
24 * | |
25 * Thanks to Alexander Krug (http://www.krugar.de/) for pointing out and | |
26 * correcting an error in the regexp search array. Fixed 7/30/03. | |
27 * | |
28 * Updated set_html() function's file reading mechanism, 9/25/03. | |
29 * | |
30 * Thanks to Joss Sanglier (http://www.dancingbear.co.uk/) for adding | |
31 * several more HTML entity codes to the $search and $replace arrays. | |
32 * Updated 11/7/03. | |
33 * | |
34 * Thanks to Darius Kasperavicius (http://www.dar.dar.lt/) for | |
35 * suggesting the addition of $allowed_tags and its supporting function | |
36 * (which I slightly modified). Updated 3/12/04. | |
37 * | |
38 * Thanks to Justin Dearing for pointing out that a replacement for the | |
39 * <TH> tag was missing, and suggesting an appropriate fix. | |
40 * Updated 8/25/04. | |
41 * | |
42 * Thanks to Mathieu Collas (http://www.myefarm.com/) for finding a | |
43 * display/formatting bug in the _build_link_list() function: email | |
44 * readers would show the left bracket and number ("[1") as part of the | |
45 * rendered email address. | |
46 * Updated 12/16/04. | |
47 * | |
48 * Thanks to Wojciech Bajon (http://histeria.pl/) for submitting code | |
49 * to handle relative links, which I hadn't considered. I modified his | |
50 * code a bit to handle normal HTTP links and MAILTO links. Also for | |
51 * suggesting three additional HTML entity codes to search for. | |
52 * Updated 03/02/05. | |
53 * | |
54 * Thanks to Jacob Chandler for pointing out another link condition | |
55 * for the _build_link_list() function: "https". | |
56 * Updated 04/06/05. | |
57 * | |
58 * Thanks to Marc Bertrand (http://www.dresdensky.com/) for | |
59 * suggesting a revision to the word wrapping functionality; if you | |
60 * specify a $width of 0 or less, word wrapping will be ignored. | |
61 * Updated 11/02/06. | |
62 * | |
63 * *** Big housecleaning updates below: | |
64 * | |
65 * Thanks to Colin Brown (http://www.sparkdriver.co.uk/) for | |
66 * suggesting the fix to handle </li> and blank lines (whitespace). | |
67 * Christian Basedau (http://www.movetheweb.de/) also suggested the | |
68 * blank lines fix. | |
69 * | |
70 * Special thanks to Marcus Bointon (http://www.synchromedia.co.uk/), | |
71 * Christian Basedau, Norbert Laposa (http://ln5.co.uk/), | |
72 * Bas van de Weijer, and Marijn van Butselaar | |
73 * for pointing out my glaring error in the <th> handling. Marcus also | |
74 * supplied a host of fixes. | |
75 * | |
76 * Thanks to Jeffrey Silverman (http://www.newtnotes.com/) for pointing | |
77 * out that extra spaces should be compressed--a problem addressed with | |
78 * Marcus Bointon's fixes but that I had not yet incorporated. | |
79 * | |
80 * Thanks to Daniel Schledermann (http://www.typoconsult.dk/) for | |
81 * suggesting a valuable fix with <a> tag handling. | |
82 * | |
83 * Thanks to Wojciech Bajon (again!) for suggesting fixes and additions, | |
84 * including the <a> tag handling that Daniel Schledermann pointed | |
85 * out but that I had not yet incorporated. I haven't (yet) | |
86 * incorporated all of Wojciech's changes, though I may at some | |
87 * future time. | |
88 * | |
89 * *** End of the housecleaning updates. Updated 08/08/07. | |
90 */ | |
91 | |
92 /** | |
93 * Converts HTML to formatted plain text | |
94 * | |
95 * @package Framework | |
96 * @subpackage Utils | |
97 */ | |
class rcube_html2text
{
    /**
     * Contains the HTML content to convert.
     *
     * @var string $html
     */
    protected $html;

    /**
     * Contains the converted, formatted text.
     *
     * @var string $text
     */
    protected $text;

    /**
     * Maximum width of the formatted text, in columns.
     *
     * Set this value to 0 (or less) to ignore word wrapping
     * and not constrain text to a fixed-width column.
     *
     * Note: the constructor always overwrites this value with its own
     * $width argument (which defaults to 75).
     *
     * @var integer $width
     */
    protected $width = 70;

    /**
     * Target character encoding for output text
     *
     * @var string $charset
     */
    protected $charset = 'UTF-8';

    /**
     * List of preg* regular expression patterns to search for,
     * used in conjunction with $replace.
     *
     * Short tag names are followed by \b so that they do not match longer
     * tag names sharing the same prefix (e.g. <i> vs <img>, <em> vs <embed>,
     * <p> vs <param>, <li> vs <link>, <tr> vs <track>, <head> vs <header>).
     *
     * @var array $search
     * @see $replace
     */
    protected $search = array(
        '/\r/',                                  // Non-legal carriage return
        '/^.*<body[^>]*>\n*/is',                 // Anything before <body>
        '/<head\b[^>]*>.*?<\/head>/is',          // <head>
        '/<script[^>]*>.*?<\/script>/is',        // <script>
        '/<style[^>]*>.*?<\/style>/is',          // <style>
        '/[\n\t]+/',                             // Newlines and tabs
        '/<p\b[^>]*>/i',                         // <p>
        '/<\/p>[\s\n\t]*<div[^>]*>/i',           // </p> before <div>
        '/<br[^>]*>[\s\n\t]*<div[^>]*>/i',       // <br> before <div>
        '/<br[^>]*>\s*/i',                       // <br>
        '/<i\b[^>]*>(.*?)<\/i>/i',               // <i>
        '/<em\b[^>]*>(.*?)<\/em>/i',             // <em>
        '/(<ul[^>]*>|<\/ul>)/i',                 // <ul> and </ul>
        '/(<ol[^>]*>|<\/ol>)/i',                 // <ol> and </ol>
        '/<li\b[^>]*>(.*?)<\/li>/i',             // <li> and </li>
        '/<li\b[^>]*>/i',                        // <li>
        '/<hr[^>]*>/i',                          // <hr>
        '/<div[^>]*>/i',                         // <div>
        '/(<table[^>]*>|<\/table>)/i',           // <table> and </table>
        '/(<tr\b[^>]*>|<\/tr>)/i',               // <tr> and </tr>
        '/<td[^>]*>(.*?)<\/td>/i',               // <td> and </td>
    );

    /**
     * List of pattern replacements corresponding to patterns searched.
     *
     * @var array $replace
     * @see $search
     */
    protected $replace = array(
        '',                                      // Non-legal carriage return
        '',                                      // Anything before <body>
        '',                                      // <head>
        '',                                      // <script>
        '',                                      // <style>
        ' ',                                     // Newlines and tabs
        "\n\n",                                  // <p>
        "\n<div>",                               // </p> before <div>
        '<div>',                                 // <br> before <div>
        "\n",                                    // <br>
        '_\\1_',                                 // <i>
        '_\\1_',                                 // <em>
        "\n\n",                                  // <ul> and </ul>
        "\n\n",                                  // <ol> and </ol>
        "\t* \\1\n",                             // <li> and </li>
        "\n\t* ",                                // <li>
        "\n-------------------------\n",         // <hr>
        "<div>\n",                               // <div>
        "\n\n",                                  // <table> and </table>
        "\n",                                    // <tr> and </tr>
        "\t\t\\1\n",                             // <td> and </td>
    );

    /**
     * List of preg* regular expression patterns to search for,
     * used in conjunction with $ent_replace.
     *
     * @var array $ent_search
     * @see $ent_replace
     */
    protected $ent_search = array(
        '/&(nbsp|#160);/i',                      // Non-breaking space
        '/&(quot|rdquo|ldquo|#8220|#8221|#147|#148);/i',
                                                 // Double quotes
        '/&(apos|rsquo|lsquo|#8216|#8217);/i',   // Single quotes
        '/&gt;/i',                               // Greater-than
        '/&lt;/i',                               // Less-than
        '/&(copy|#169);/i',                      // Copyright
        '/&(trade|#8482|#153);/i',               // Trademark
        '/&(reg|#174);/i',                       // Registered
        '/&(mdash|#151|#8212);/i',               // mdash
        '/&(ndash|minus|#8211|#8722);/i',        // ndash
        '/&(bull|#149|#8226);/i',                // Bullet
        '/&(pound|#163);/i',                     // Pound sign
        '/&(euro|#8364);/i',                     // Euro sign
        '/&(amp|#38);/i',                        // Ampersand: see _converter()
        '/[ ]{2,}/',                             // Runs of spaces, post-handling
    );

    /**
     * List of pattern replacements corresponding to patterns searched.
     *
     * @var array $ent_replace
     * @see $ent_search
     */
    protected $ent_replace = array(
        "\xC2\xA0",                              // Non-breaking space
        '"',                                     // Double quotes
        "'",                                     // Single quotes
        '>',                                     // Greater-than
        '<',                                     // Less-than
        '(c)',                                   // Copyright
        '(tm)',                                  // Trademark
        '(R)',                                   // Registered
        '--',                                    // mdash
        '-',                                     // ndash
        '*',                                     // Bullet
        '£',                                     // Pound sign
        'EUR',                                   // Euro sign. €
        '|+|amp|+|',                             // Ampersand: see _converter()
        ' ',                                     // Runs of spaces, post-handling
    );

    /**
     * List of preg* regular expression patterns to search for
     * and replace using callback function.
     *
     * The first capture group of every pattern is the tag name that
     * tags_preg_callback() dispatches on.
     *
     * @var array $callback_search
     */
    protected $callback_search = array(
        '/<(a) [^>]*href=("|\')([^"\']+)\2[^>]*>(.*?)<\/a>/i', // <a href="">
        '/<(h)[123456]( [^>]*)?>(.*?)<\/h[123456]>/i',         // h1 - h6
        '/<(b)( [^>]*)?>(.*?)<\/b>/i',                         // <b>
        '/<(strong)( [^>]*)?>(.*?)<\/strong>/i',               // <strong>
        '/<(th)( [^>]*)?>(.*?)<\/th>/i',                       // <th> and </th>
    );

    /**
     * List of preg* regular expression patterns to search for in PRE body,
     * used in conjunction with $pre_replace.
     *
     * @var array $pre_search
     * @see $pre_replace
     */
    protected $pre_search = array(
        "/\n/",
        "/\t/",
        '/ /',
        '/<pre[^>]*>/i',
        '/<\/pre>/i',
    );

    /**
     * List of pattern replacements corresponding to patterns searched for PRE body.
     *
     * Whitespace is turned into &nbsp; entities so that it survives the
     * "runs of spaces" compression applied later by $ent_search; the
     * entities are converted back to regular spaces in _converter().
     *
     * @var array $pre_replace
     * @see $pre_search
     */
    protected $pre_replace = array(
        '<br>',
        '&nbsp;&nbsp;&nbsp;&nbsp;',
        '&nbsp;',
        '',
        '',
    );

    /**
     * Temporary storage for converted PRE content, handed over to
     * pre_preg_callback() while _convert_pre() is running.
     *
     * @var string $pre_content
     * @see _convert_pre()
     */
    protected $pre_content = '';

    /**
     * Contains a list of HTML tags to allow in the resulting text.
     *
     * @var string $allowed_tags
     * @see set_allowed_tags()
     */
    protected $allowed_tags = '';

    /**
     * Contains the base URL that relative links should resolve to.
     *
     * @var string $url
     */
    protected $url;

    /**
     * Indicates whether content in the $html variable has been converted yet.
     *
     * @var boolean $_converted
     * @see $html, $text
     */
    protected $_converted = false;

    /**
     * Contains URL addresses from links to be rendered in plain text.
     *
     * @var array $_link_list
     * @see _build_link_list()
     */
    protected $_link_list = array();

    /**
     * Boolean flag, true if a table of link URLs should be listed after the text.
     *
     * @var boolean $_do_links
     * @see __construct()
     */
    protected $_do_links = true;

    /**
     * Constructor.
     *
     * If the HTML source string (or file) is supplied, the class
     * will instantiate with that source propagated, all that has
     * to be done is to call get_text().
     *
     * @param string  $source    HTML content
     * @param boolean $from_file Indicates $source is a file to pull content from
     * @param boolean $do_links  Indicate whether a table of link URLs is desired
     * @param integer $width     Maximum width of the formatted text, 0 for no limit
     * @param string  $charset   Target character encoding of the output text
     */
    public function __construct($source = '', $from_file = false, $do_links = true, $width = 75, $charset = 'UTF-8')
    {
        if (!empty($source)) {
            $this->set_html($source, $from_file);
        }

        $this->set_base_url();

        $this->_do_links = $do_links;
        $this->width     = $width;
        $this->charset   = $charset;
    }

    /**
     * Loads source HTML into memory, either from $source string or a file.
     *
     * When $from_file is set but the file does not exist, $source itself
     * is used as the HTML content.
     *
     * @param string  $source    HTML content
     * @param boolean $from_file Indicates $source is a file to pull content from
     */
    public function set_html($source, $from_file = false)
    {
        if ($from_file && file_exists($source)) {
            $this->html = file_get_contents($source);
        }
        else {
            $this->html = $source;
        }

        // force a fresh conversion on the next get_text() call
        $this->_converted = false;
    }

    /**
     * Returns the text, converted from HTML.
     *
     * The conversion is done lazily, once per loaded HTML source.
     *
     * @return string Plain text
     */
    public function get_text()
    {
        if (!$this->_converted) {
            $this->_convert();
        }

        return $this->text;
    }

    /**
     * Prints the text, converted from HTML.
     */
    public function print_text()
    {
        print $this->get_text();
    }

    /**
     * Sets the allowed HTML tags to pass through to the resulting text.
     *
     * Tags should be in the form "<p>", with no corresponding closing tag.
     * An empty argument leaves the current setting untouched.
     *
     * @param string $allowed_tags List of tags, in the format accepted by strip_tags()
     */
    public function set_allowed_tags($allowed_tags = '')
    {
        if (!empty($allowed_tags)) {
            $this->allowed_tags = $allowed_tags;
        }
    }

    /**
     * Sets a base URL to handle relative links.
     *
     * With an empty argument the base URL is built from the HTTP_HOST
     * server variable (if available), otherwise it is set to an empty string.
     *
     * @param string $url Base URL
     */
    public function set_base_url($url = '')
    {
        if (empty($url)) {
            if (!empty($_SERVER['HTTP_HOST'])) {
                $this->url = 'http://' . $_SERVER['HTTP_HOST'];
            }
            else {
                $this->url = '';
            }
        }
        else {
            // Strip any trailing slashes for consistency (relative
            // URLs may already start with a slash like "/file.html")
            if (substr($url, -1) == '/') {
                $url = substr($url, 0, -1);
            }

            $this->url = $url;
        }
    }

    /**
     * Workhorse function that does actual conversion (calls _converter() method).
     */
    protected function _convert()
    {
        // Variables used for building the link list
        $this->_link_list = array();

        // cast, so a missing source (null) never reaches the preg_* functions
        $text = (string) $this->html;

        // Convert HTML to TXT
        $this->_converter($text);

        // Add link list
        if (!empty($this->_link_list)) {
            $text .= "\n\nLinks:\n------\n";
            foreach ($this->_link_list as $idx => $url) {
                $text .= '[' . ($idx+1) . '] ' . $url . "\n";
            }
        }

        $this->text       = $text;
        $this->_converted = true;
    }

    /**
     * Workhorse function that does actual conversion.
     *
     * First performs custom tag replacement specified by $search and
     * $replace arrays. Then strips any remaining HTML tags, reduces whitespace
     * and newlines to a readable format, and word wraps the text to
     * $width characters.
     *
     * @param string &$text Reference to HTML content string
     */
    protected function _converter(&$text)
    {
        // Convert <BLOCKQUOTE> (before PRE!)
        $this->_convert_blockquotes($text);

        // Convert <PRE>
        $this->_convert_pre($text);

        // Run our defined tags search-and-replace
        $text = preg_replace($this->search, $this->replace, $text);

        // Run our defined tags search-and-replace with callback
        $text = preg_replace_callback($this->callback_search, array($this, 'tags_preg_callback'), $text);

        // Strip any other HTML tags
        $text = strip_tags($text, $this->allowed_tags);

        // Run our defined entities/characters search-and-replace
        $text = preg_replace($this->ent_search, $this->ent_replace, $text);

        // Replace known html entities
        $text = html_entity_decode($text, ENT_QUOTES, $this->charset);

        // Replace unicode nbsp to regular spaces
        $text = preg_replace('/\xC2\xA0/', ' ', $text);

        // Remove unknown/unhandled entities (this cannot be done in search-and-replace block)
        $text = preg_replace('/&([a-zA-Z0-9]{2,6}|#[0-9]{2,4});/', '', $text);

        // Convert "|+|amp|+|" into "&", need to be done after handling of unknown entities
        // This properly handles situation of "&amp;quot;" in input string
        $text = str_replace('|+|amp|+|', '&', $text);

        // Bring down number of empty lines to 2 max
        $text = preg_replace("/\n\s+\n/", "\n\n", $text);
        $text = preg_replace("/[\n]{3,}/", "\n\n", $text);

        // remove leading empty lines (can be produced by eg. P tag on the beginning)
        $text = ltrim($text, "\n");

        // Wrap the text to a readable format.
        // If width is 0 or less, don't wrap the text.
        if ($this->width > 0) {
            $text = wordwrap($text, $this->width);
        }
    }

    /**
     * Helper function called by tags_preg_callback() on link replacement.
     *
     * Maintains an internal list of links to be displayed at the end of the
     * text, with numeric indices to the original point in the text they
     * appeared. Also makes an effort at identifying and handling absolute
     * and relative links.
     *
     * @param string $link    URL of the link
     * @param string $display Part of the text to associate number with
     *
     * @return string Display text, followed by " [N]" when the link was listed
     */
    protected function _build_link_list($link, $display)
    {
        if (!$this->_do_links || empty($link)) {
            return $display;
        }

        // Ignored link types
        if (preg_match('!^(javascript:|mailto:|#)!i', $link)) {
            return $display;
        }

        // skip links with href == content (#1490434)
        if ($link === $display) {
            return $display;
        }

        if (preg_match('!^([a-z][a-z0-9.+-]+:)!i', $link)) {
            // absolute URL (has a scheme)
            $url = $link;
        }
        else {
            // relative URL, resolve against the base URL
            $url = $this->url;
            if (substr($link, 0, 1) != '/') {
                $url .= '/';
            }
            $url .= "$link";
        }

        // every distinct URL is listed only once
        if (($index = array_search($url, $this->_link_list, true)) === false) {
            $index = count($this->_link_list);
            $this->_link_list[] = $url;
        }

        return $display . ' [' . ($index+1) . ']';
    }

    /**
     * Helper function for PRE body conversion.
     *
     * Every PRE element is replaced with a DIV in which line breaks and
     * whitespace are expressed as <br> tags and &nbsp; entities, so they
     * are preserved by the subsequent conversion steps.
     *
     * @param string &$text HTML content
     */
    protected function _convert_pre(&$text)
    {
        // get the content of PRE element
        while (preg_match('/<pre[^>]*>(.*)<\/pre>/ismU', $text, $matches)) {
            // Run our defined tags search-and-replace with callback
            $content = preg_replace_callback($this->callback_search,
                array($this, 'tags_preg_callback'), $matches[1]);

            // convert the content
            $this->pre_content = sprintf('<div><br>%s<br></div>',
                preg_replace($this->pre_search, $this->pre_replace, $content));

            // replace the content (use callback because content can contain $0 variable)
            $text = preg_replace_callback('/<pre[^>]*>.*<\/pre>/ismU',
                array($this, 'pre_preg_callback'), $text, 1);

            // free memory
            $this->pre_content = '';
        }
    }

    /**
     * Helper function for BLOCKQUOTE body conversion.
     *
     * Every top-level BLOCKQUOTE element is converted (recursively, so
     * nested quotes are handled by the inner _converter() call) into
     * a PRE block in which each line is prefixed with a ">" citation marker.
     *
     * @param string &$text HTML content
     */
    protected function _convert_blockquotes(&$text)
    {
        $open_len  = 11; // strlen('<blockquote')
        $close_len = 13; // strlen('</blockquote>')
        $offset    = 0;

        while (($start = stripos($text, '<blockquote', $offset)) !== false) {
            // Search offsets never exceed the string length, which would
            // make stripos() fail on truncated input.
            $offset = $start + $open_len;
            $level  = 0;

            while (true) {
                $end  = stripos($text, '</blockquote>', $offset);
                $next = stripos($text, '<blockquote', $offset);

                // abort on invalid tag structure (e.g. no closing tag found)
                if ($end === false) {
                    break;
                }

                // nested <blockquote> opens before the next closing tag, go deeper
                if ($next !== false && $next < $end) {
                    $offset = $next + $open_len;
                    $level++;
                    continue;
                }

                // closing tag of a nested <blockquote>, go up
                if ($level > 0) {
                    $offset = $end + $close_len;
                    $level--;
                    continue;
                }

                // found matching end tag
                $taglen   = strpos($text, '>', $start) - $start;
                $startpos = $start + $taglen + 1;

                // get blockquote content
                $body = trim((string) substr($text, $startpos, $end - $startpos));

                // adjust text wrapping width (make room for the citation marker)
                $p_width = $this->width;
                if ($this->width > 0) {
                    $this->width -= 2;
                }

                // replace content with inner blockquotes
                $this->_converter($body);

                // restore text width
                $this->width = $p_width;

                // Add citation markers and create <pre> block
                $body = preg_replace_callback('/((?:^|\n)>*)([^\n]*)/',
                    array($this, 'blockquote_citation_callback'), trim($body));
                $body = '<pre>' . htmlspecialchars($body, ENT_COMPAT, $this->charset) . '</pre>';

                $text   = substr_replace($text, $body . "\n", $start, $end + $close_len - $start);
                $offset = 0;

                break;
            }
        }
    }

    /**
     * Callback function to correctly add citation markers for blockquote contents
     *
     * @param array $m PREG matches: [1] line start with existing markers, [2] line content
     *
     * @return string Line with one more citation marker
     */
    public function blockquote_citation_callback($m)
    {
        $line = ltrim($m[2]);

        // No space between markers of nested citations. The isset() check
        // prevents from accessing an offset of an empty (blank) line.
        $space = isset($line[0]) && $line[0] == '>' ? '' : ' ';

        return $m[1] . '>' . $space . $line;
    }

    /**
     * Callback function for preg_replace_callback use.
     *
     * @param array $matches PREG matches (see $callback_search)
     *
     * @return string Replacement text
     */
    public function tags_preg_callback($matches)
    {
        switch (strtolower($matches[1])) {
        case 'b':
        case 'strong':
            return $this->_toupper($matches[3]);
        case 'th':
            return $this->_toupper("\t\t". $matches[3] ."\n");
        case 'h':
            return $this->_toupper("\n\n". $matches[3] ."\n\n");
        case 'a':
            // Remove spaces in URL (#1487805)
            $url = str_replace(' ', '', $matches[3]);
            return $this->_build_link_list($url, $matches[4]);
        }

        // unknown tag, leave the matched content untouched
        return $matches[0];
    }

    /**
     * Callback function for preg_replace_callback use in PRE content handler.
     *
     * @param array $matches PREG matches
     *
     * @return string Converted PRE content prepared by _convert_pre()
     */
    public function pre_preg_callback($matches)
    {
        return $this->pre_content;
    }

    /**
     * Strtoupper function with HTML tags and entities handling.
     *
     * @param string $str Text to convert
     *
     * @return string Converted text
     */
    private function _toupper($str)
    {
        // string can contain HTML tags
        $chunks = preg_split('/(<[^>]*>)/', $str, -1, PREG_SPLIT_NO_EMPTY | PREG_SPLIT_DELIM_CAPTURE);

        // convert toupper only the text between HTML tags
        foreach ($chunks as $idx => $chunk) {
            if ($chunk[0] != '<') {
                $chunks[$idx] = $this->_strtoupper($chunk);
            }
        }

        return implode($chunks);
    }

    /**
     * Strtoupper multibyte wrapper function with HTML entities handling.
     *
     * Entities are decoded first, so that they are not broken by the case
     * conversion (e.g. "&amp;" into "&AMP;"), and encoded back afterwards.
     *
     * @param string $str Text to convert
     *
     * @return string Converted text
     */
    private function _strtoupper($str)
    {
        $str = html_entity_decode($str, ENT_COMPAT, $this->charset);
        $str = mb_strtoupper($str);
        $str = htmlspecialchars($str, ENT_COMPAT, $this->charset);

        return $str;
    }
}