comparison program/lib/Roundcube/rcube_html2text.php @ 0:4681f974d28b

vanilla 1.3.3 distro, I hope
author Charlie Root
date Thu, 04 Jan 2018 15:52:31 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:4681f974d28b
1 <?php
2
3 /**
4 +-----------------------------------------------------------------------+
5 | This file is part of the Roundcube Webmail client |
6 | Copyright (C) 2008-2012, The Roundcube Dev Team |
7 | Copyright (c) 2005-2007, Jon Abernathy <jon@chuggnutt.com> |
8 | |
9 | Licensed under the GNU General Public License version 3 or |
10 | any later version with exceptions for skins & plugins. |
11 | See the README file for a full license statement. |
12 | |
13 | PURPOSE: |
14 | Converts HTML to formatted plain text (based on html2text class) |
15 +-----------------------------------------------------------------------+
16 | Author: Thomas Bruederli <roundcube@gmail.com> |
17 | Author: Aleksander Machniak <alec@alec.pl> |
18 | Author: Jon Abernathy <jon@chuggnutt.com> |
19 +-----------------------------------------------------------------------+
20 */
21
22 /**
23 * Takes HTML and converts it to formatted, plain text.
24 *
25 * Thanks to Alexander Krug (http://www.krugar.de/) for pointing out and
26 * correcting an error in the regexp search array. Fixed 7/30/03.
27 *
28 * Updated set_html() function's file reading mechanism, 9/25/03.
29 *
30 * Thanks to Joss Sanglier (http://www.dancingbear.co.uk/) for adding
31 * several more HTML entity codes to the $search and $replace arrays.
32 * Updated 11/7/03.
33 *
34 * Thanks to Darius Kasperavicius (http://www.dar.dar.lt/) for
35 * suggesting the addition of $allowed_tags and its supporting function
36 * (which I slightly modified). Updated 3/12/04.
37 *
38 * Thanks to Justin Dearing for pointing out that a replacement for the
39 * <TH> tag was missing, and suggesting an appropriate fix.
40 * Updated 8/25/04.
41 *
42 * Thanks to Mathieu Collas (http://www.myefarm.com/) for finding a
43 * display/formatting bug in the _build_link_list() function: email
44 * readers would show the left bracket and number ("[1") as part of the
45 * rendered email address.
46 * Updated 12/16/04.
47 *
48 * Thanks to Wojciech Bajon (http://histeria.pl/) for submitting code
49 * to handle relative links, which I hadn't considered. I modified his
50 * code a bit to handle normal HTTP links and MAILTO links. Also for
51 * suggesting three additional HTML entity codes to search for.
52 * Updated 03/02/05.
53 *
54 * Thanks to Jacob Chandler for pointing out another link condition
55 * for the _build_link_list() function: "https".
56 * Updated 04/06/05.
57 *
58 * Thanks to Marc Bertrand (http://www.dresdensky.com/) for
59 * suggesting a revision to the word wrapping functionality; if you
60 * specify a $width of 0 or less, word wrapping will be ignored.
61 * Updated 11/02/06.
62 *
63 * *** Big housecleaning updates below:
64 *
65 * Thanks to Colin Brown (http://www.sparkdriver.co.uk/) for
66 * suggesting the fix to handle </li> and blank lines (whitespace).
67 * Christian Basedau (http://www.movetheweb.de/) also suggested the
68 * blank lines fix.
69 *
70 * Special thanks to Marcus Bointon (http://www.synchromedia.co.uk/),
71 * Christian Basedau, Norbert Laposa (http://ln5.co.uk/),
72 * Bas van de Weijer, and Marijn van Butselaar
73 * for pointing out my glaring error in the <th> handling. Marcus also
74 * supplied a host of fixes.
75 *
76 * Thanks to Jeffrey Silverman (http://www.newtnotes.com/) for pointing
77 * out that extra spaces should be compressed--a problem addressed with
78 * Marcus Bointon's fixes but that I had not yet incorporated.
79 *
80 * Thanks to Daniel Schledermann (http://www.typoconsult.dk/) for
81 * suggesting a valuable fix with <a> tag handling.
82 *
83 * Thanks to Wojciech Bajon (again!) for suggesting fixes and additions,
84 * including the <a> tag handling that Daniel Schledermann pointed
85 * out but that I had not yet incorporated. I haven't (yet)
86 * incorporated all of Wojciech's changes, though I may at some
87 * future time.
88 *
89 * *** End of the housecleaning updates. Updated 08/08/07.
90 */
91
92 /**
93 * Converts HTML to formatted plain text
94 *
95 * @package Framework
96 * @subpackage Utils
97 */
class rcube_html2text
{
    /**
     * Contains the HTML content to convert.
     *
     * @var string $html
     */
    protected $html;

    /**
     * Contains the converted, formatted text.
     *
     * @var string $text
     */
    protected $text;

    /**
     * Maximum width of the formatted text, in columns.
     *
     * Set this value to 0 (or less) to ignore word wrapping
     * and not constrain text to a fixed-width column.
     * Note: the constructor always overwrites this value with its
     * own $width argument (default 75).
     *
     * @var integer $width
     */
    protected $width = 70;

    /**
     * Target character encoding for output text
     *
     * @var string $charset
     */
    protected $charset = 'UTF-8';

    /**
     * List of preg* regular expression patterns to search for,
     * used in conjunction with $replace.
     *
     * @var array $search
     * @see $replace
     */
    protected $search = array(
        '/\r/',                                  // Non-legal carriage return
        '/^.*<body[^>]*>\n*/is',                 // Anything before <body>
        '/<head[^>]*>.*?<\/head>/is',            // <head>
        '/<script[^>]*>.*?<\/script>/is',        // <script>
        '/<style[^>]*>.*?<\/style>/is',          // <style>
        '/[\n\t]+/',                             // Newlines and tabs
        '/<p[^>]*>/i',                           // <p>
        '/<\/p>[\s\n\t]*<div[^>]*>/i',           // </p> before <div>
        '/<br[^>]*>[\s\n\t]*<div[^>]*>/i',       // <br> before <div>
        '/<br[^>]*>\s*/i',                       // <br>
        '/<i[^>]*>(.*?)<\/i>/i',                 // <i>
        '/<em[^>]*>(.*?)<\/em>/i',               // <em>
        '/(<ul[^>]*>|<\/ul>)/i',                 // <ul> and </ul>
        '/(<ol[^>]*>|<\/ol>)/i',                 // <ol> and </ol>
        '/<li[^>]*>(.*?)<\/li>/i',               // <li> and </li>
        '/<li[^>]*>/i',                          // <li>
        '/<hr[^>]*>/i',                          // <hr>
        '/<div[^>]*>/i',                         // <div>
        '/(<table[^>]*>|<\/table>)/i',           // <table> and </table>
        '/(<tr[^>]*>|<\/tr>)/i',                 // <tr> and </tr>
        '/<td[^>]*>(.*?)<\/td>/i',               // <td> and </td>
    );

    /**
     * List of pattern replacements corresponding to patterns searched.
     *
     * @var array $replace
     * @see $search
     */
    protected $replace = array(
        '',                                      // Non-legal carriage return
        '',                                      // Anything before <body>
        '',                                      // <head>
        '',                                      // <script>
        '',                                      // <style>
        ' ',                                     // Newlines and tabs
        "\n\n",                                  // <p>
        "\n<div>",                               // </p> before <div>
        '<div>',                                 // <br> before <div>
        "\n",                                    // <br>
        '_\\1_',                                 // <i>
        '_\\1_',                                 // <em>
        "\n\n",                                  // <ul> and </ul>
        "\n\n",                                  // <ol> and </ol>
        "\t* \\1\n",                             // <li> and </li>
        "\n\t* ",                                // <li>
        "\n-------------------------\n",         // <hr>
        "<div>\n",                               // <div>
        "\n\n",                                  // <table> and </table>
        "\n",                                    // <tr> and </tr>
        "\t\t\\1\n",                             // <td> and </td>
    );

    /**
     * List of preg* regular expression patterns to search for,
     * used in conjunction with $ent_replace.
     *
     * @var array $ent_search
     * @see $ent_replace
     */
    protected $ent_search = array(
        '/&(nbsp|#160);/i',                      // Non-breaking space
        '/&(quot|rdquo|ldquo|#8220|#8221|#147|#148);/i',
                                                 // Double quotes
        '/&(apos|rsquo|lsquo|#8216|#8217);/i',   // Single quotes
        '/&gt;/i',                               // Greater-than
        '/&lt;/i',                               // Less-than
        '/&(copy|#169);/i',                      // Copyright
        '/&(trade|#8482|#153);/i',               // Trademark
        '/&(reg|#174);/i',                       // Registered
        '/&(mdash|#151|#8212);/i',               // mdash
        '/&(ndash|minus|#8211|#8722);/i',        // ndash
        '/&(bull|#149|#8226);/i',                // Bullet
        '/&(pound|#163);/i',                     // Pound sign
        '/&(euro|#8364);/i',                     // Euro sign
        '/&(amp|#38);/i',                        // Ampersand: see _converter()
        '/[ ]{2,}/',                             // Runs of spaces, post-handling
    );

    /**
     * List of pattern replacements corresponding to patterns searched.
     *
     * @var array $ent_replace
     * @see $ent_search
     */
    protected $ent_replace = array(
        "\xC2\xA0",                              // Non-breaking space
        '"',                                     // Double quotes
        "'",                                     // Single quotes
        '>',
        '<',
        '(c)',
        '(tm)',
        '(R)',
        '--',
        '-',
        '*',
        '£',
        'EUR',                                   // Euro sign. €
        '|+|amp|+|',                             // Ampersand: see _converter()
        ' ',                                     // Runs of spaces, post-handling
    );

    /**
     * List of preg* regular expression patterns to search for
     * and replace using callback function.
     *
     * @var array $callback_search
     */
    protected $callback_search = array(
        '/<(a) [^>]*href=("|\')([^"\']+)\2[^>]*>(.*?)<\/a>/i', // <a href="">
        '/<(h)[123456]( [^>]*)?>(.*?)<\/h[123456]>/i',         // h1 - h6
        '/<(b)( [^>]*)?>(.*?)<\/b>/i',                         // <b>
        '/<(strong)( [^>]*)?>(.*?)<\/strong>/i',               // <strong>
        '/<(th)( [^>]*)?>(.*?)<\/th>/i',                       // <th> and </th>
    );

    /**
     * List of preg* regular expression patterns to search for in PRE body,
     * used in conjunction with $pre_replace.
     *
     * @var array $pre_search
     * @see $pre_replace
     */
    protected $pre_search = array(
        "/\n/",
        "/\t/",
        '/ /',
        '/<pre[^>]*>/',
        '/<\/pre>/'
    );

    /**
     * List of pattern replacements corresponding to patterns searched for PRE body.
     *
     * @var array $pre_replace
     * @see $pre_search
     */
    protected $pre_replace = array(
        '<br>',
        '&nbsp;&nbsp;&nbsp;&nbsp;',
        '&nbsp;',
        '',
        ''
    );

    /**
     * Temporary storage for the converted body of the PRE element
     * currently being processed by _convert_pre().
     *
     * @var string $pre_content
     * @see _convert_pre(), pre_preg_callback()
     */
    protected $pre_content = '';

    /**
     * Contains a list of HTML tags to allow in the resulting text.
     *
     * @var string $allowed_tags
     * @see set_allowed_tags()
     */
    protected $allowed_tags = '';

    /**
     * Contains the base URL that relative links should resolve to.
     *
     * @var string $url
     */
    protected $url;

    /**
     * Indicates whether content in the $html variable has been converted yet.
     *
     * @var boolean $_converted
     * @see $html, $text
     */
    protected $_converted = false;

    /**
     * Contains URL addresses from links to be rendered in plain text.
     *
     * @var array $_link_list
     * @see _build_link_list()
     */
    protected $_link_list = array();

    /**
     * Boolean flag, true if a table of link URLs should be listed after the text.
     *
     * @var boolean $_do_links
     * @see __construct()
     */
    protected $_do_links = true;

    /**
     * Constructor.
     *
     * If the HTML source string (or file) is supplied, the class
     * will instantiate with that source propagated, all that has
     * to be done is to call get_text().
     *
     * @param string  $source    HTML content
     * @param boolean $from_file Indicates $source is a file to pull content from
     * @param boolean $do_links  Indicate whether a table of link URLs is desired
     * @param integer $width     Maximum width of the formatted text, 0 for no limit
     * @param string  $charset   Target character encoding of the output text
     */
    public function __construct($source = '', $from_file = false, $do_links = true, $width = 75, $charset = 'UTF-8')
    {
        if (!empty($source)) {
            $this->set_html($source, $from_file);
        }

        $this->set_base_url();

        $this->_do_links = $do_links;
        $this->width     = $width;
        $this->charset   = $charset;
    }

    /**
     * Loads source HTML into memory, either from $source string or a file.
     *
     * When $from_file is set but $source does not exist on disk, $source
     * itself is used as the HTML content.
     *
     * @param string  $source    HTML content
     * @param boolean $from_file Indicates $source is a file to pull content from
     */
    public function set_html($source, $from_file = false)
    {
        if ($from_file && file_exists($source)) {
            $content = file_get_contents($source);
            // an unreadable file yields empty content instead of boolean false
            $this->html = $content === false ? '' : $content;
        }
        else {
            $this->html = $source;
        }

        $this->_converted = false;
    }

    /**
     * Returns the text, converted from HTML.
     *
     * The conversion is done lazily on the first call and cached
     * until new content is loaded with set_html().
     *
     * @return string Plain text
     */
    public function get_text()
    {
        if (!$this->_converted) {
            $this->_convert();
        }

        return $this->text;
    }

    /**
     * Prints the text, converted from HTML.
     */
    public function print_text()
    {
        print $this->get_text();
    }

    /**
     * Sets the allowed HTML tags to pass through to the resulting text.
     *
     * Tags should be in the form "<p>", with no corresponding closing tag.
     * An empty argument leaves the current setting untouched.
     *
     * @param string $allowed_tags List of tags, as accepted by strip_tags()
     */
    public function set_allowed_tags($allowed_tags = '')
    {
        if (!empty($allowed_tags)) {
            $this->allowed_tags = $allowed_tags;
        }
    }

    /**
     * Sets a base URL to handle relative links.
     *
     * With an empty argument the base URL is built from the HTTP_HOST
     * server variable (or reset to an empty string if that is not set).
     *
     * @param string $url Base URL
     */
    public function set_base_url($url = '')
    {
        if (empty($url)) {
            if (!empty($_SERVER['HTTP_HOST'])) {
                $this->url = 'http://' . $_SERVER['HTTP_HOST'];
            }
            else {
                $this->url = '';
            }
        }
        else {
            // Strip any trailing slashes for consistency (relative
            // URLs may already start with a slash like "/file.html")
            if (substr($url, -1) == '/') {
                $url = substr($url, 0, -1);
            }
            $this->url = $url;
        }
    }

    /**
     * Workhorse function that does actual conversion (calls _converter() method).
     *
     * Stores the result in $text and appends the list of collected links.
     */
    protected function _convert()
    {
        // Variables used for building the link list
        $this->_link_list = array();

        $text = $this->html;

        // Convert HTML to TXT
        $this->_converter($text);

        // Add link list
        if (!empty($this->_link_list)) {
            $text .= "\n\nLinks:\n------\n";
            foreach ($this->_link_list as $idx => $url) {
                $text .= '[' . ($idx+1) . '] ' . $url . "\n";
            }
        }

        $this->text       = $text;
        $this->_converted = true;
    }

    /**
     * Workhorse function that does actual conversion.
     *
     * First performs custom tag replacement specified by $search and
     * $replace arrays. Then strips any remaining HTML tags, reduces whitespace
     * and newlines to a readable format, and word wraps the text to
     * $width characters.
     *
     * @param string &$text Reference to HTML content string
     */
    protected function _converter(&$text)
    {
        // Convert <BLOCKQUOTE> (before PRE!)
        $this->_convert_blockquotes($text);

        // Convert <PRE>
        $this->_convert_pre($text);

        // Run our defined tags search-and-replace
        $text = preg_replace($this->search, $this->replace, $text);

        // Run our defined tags search-and-replace with callback
        $text = preg_replace_callback($this->callback_search, array($this, 'tags_preg_callback'), $text);

        // Strip any other HTML tags
        $text = strip_tags($text, $this->allowed_tags);

        // Run our defined entities/characters search-and-replace
        $text = preg_replace($this->ent_search, $this->ent_replace, $text);

        // Replace known html entities
        $text = html_entity_decode($text, ENT_QUOTES, $this->charset);

        // Replace unicode nbsp to regular spaces
        $text = preg_replace('/\xC2\xA0/', ' ', $text);

        // Remove unknown/unhandled entities (this cannot be done in search-and-replace block)
        $text = preg_replace('/&([a-zA-Z0-9]{2,6}|#[0-9]{2,4});/', '', $text);

        // Convert "|+|amp|+|" into "&", need to be done after handling of unknown entities
        // This properly handles situation of "&amp;quot;" in input string
        $text = str_replace('|+|amp|+|', '&', $text);

        // Bring down number of empty lines to 2 max
        $text = preg_replace("/\n\s+\n/", "\n\n", $text);
        $text = preg_replace("/[\n]{3,}/", "\n\n", $text);

        // remove leading empty lines (can be produced by eg. P tag on the beginning)
        $text = ltrim($text, "\n");

        // Wrap the text to a readable format
        // for PHP versions >= 4.0.2. Default width is 75
        // If width is 0 or less, don't wrap the text.
        if ($this->width > 0) {
            $text = wordwrap($text, $this->width);
        }
    }

    /**
     * Helper function called by preg_replace() on link replacement.
     *
     * Maintains an internal list of links to be displayed at the end of the
     * text, with numeric indices to the original point in the text they
     * appeared. Also makes an effort at identifying and handling absolute
     * and relative links.
     *
     * @param string $link    URL of the link
     * @param string $display Part of the text to associate number with
     *
     * @return string Display text, followed by " [N]" when the link was listed
     */
    protected function _build_link_list($link, $display)
    {
        if (!$this->_do_links || empty($link)) {
            return $display;
        }

        // Ignored link types
        if (preg_match('!^(javascript:|mailto:|#)!i', $link)) {
            return $display;
        }

        // skip links with href == content (#1490434)
        if ($link === $display) {
            return $display;
        }

        if (preg_match('!^([a-z][a-z0-9.+-]+:)!i', $link)) {
            // link with a scheme, use as is
            $url = $link;
        }
        else {
            // relative link, prepend the base URL
            $url = $this->url;
            if (substr($link, 0, 1) != '/') {
                $url .= '/';
            }
            $url .= "$link";
        }

        // reuse the index of an already listed URL (strict string comparison)
        if (($index = array_search($url, $this->_link_list, true)) === false) {
            $index = count($this->_link_list);
            $this->_link_list[] = $url;
        }

        return $display . ' [' . ($index+1) . ']';
    }

    /**
     * Helper function for PRE body conversion.
     *
     * Every PRE element is replaced with a DIV in which line breaks, tabs
     * and spaces are expressed as <br> and &nbsp; so they survive
     * the whitespace normalization done later in _converter().
     *
     * @param string &$text HTML content
     */
    protected function _convert_pre(&$text)
    {
        // get the content of PRE element
        while (preg_match('/<pre[^>]*>(.*)<\/pre>/ismU', $text, $matches)) {
            $this->pre_content = $matches[1];

            // Run our defined tags search-and-replace with callback
            $this->pre_content = preg_replace_callback($this->callback_search,
                array($this, 'tags_preg_callback'), $this->pre_content);

            // convert the content
            $this->pre_content = sprintf('<div><br>%s<br></div>',
                preg_replace($this->pre_search, $this->pre_replace, $this->pre_content));

            // replace the content (use callback because content can contain $0 variable)
            $text = preg_replace_callback('/<pre[^>]*>.*<\/pre>/ismU',
                array($this, 'pre_preg_callback'), $text, 1);

            // free memory
            $this->pre_content = '';
        }
    }

    /**
     * Helper function for BLOCKQUOTE body conversion.
     *
     * Every balanced BLOCKQUOTE element is converted (recursively, so nested
     * quotes get one ">" marker per level) into a PRE block with citation
     * markers. Opening tags without a matching closing tag are left alone
     * and get removed later by strip_tags().
     *
     * @param string &$text HTML content
     */
    protected function _convert_blockquotes(&$text)
    {
        $offset = 0;

        while (($start = stripos($text, '<blockquote', $offset)) !== false) {
            $end     = $this->_find_blockquote_end($text, $start);
            $tag_end = strpos($text, '>', $start);

            // invalid tag structure (e.g. no closing tag found), skip this tag
            if ($end === false || $tag_end === false || $tag_end > $end) {
                $offset = $start + strlen('<blockquote');
                continue;
            }

            // get blockquote content
            $startpos = $tag_end + 1;
            $body     = trim(substr($text, $startpos, $end - $startpos));

            // adjust text wrapping width (make room for the "> " marker)
            $p_width = $this->width;
            if ($this->width > 0) {
                $this->width -= 2;
            }

            // replace content with inner blockquotes
            $this->_converter($body);

            // restore text width
            $this->width = $p_width;

            // Add citation markers and create <pre> block
            $body = preg_replace_callback('/((?:^|\n)>*)([^\n]*)/',
                array($this, 'blockquote_citation_callback'), trim($body));

            // Explicit flags/charset: defaults differ between PHP versions, and without
            // ENT_SUBSTITUTE an invalid byte sequence would empty the whole quote
            $body = '<pre>' . htmlspecialchars($body, ENT_COMPAT | ENT_SUBSTITUTE, $this->charset) . '</pre>';

            $text = substr_replace($text, $body . "\n", $start, $end + strlen('</blockquote>') - $start);

            // the replacement is escaped, it cannot contain a <blockquote> tag,
            // so it is safe to continue from where it was inserted
            $offset = $start;
        }
    }

    /**
     * Finds the closing tag that matches the BLOCKQUOTE opened at given position,
     * taking nested BLOCKQUOTE elements (of any depth) into account.
     *
     * @param string $text  HTML content
     * @param int    $start Position of the opening "<blockquote"
     *
     * @return int|false Position of the matching "</blockquote>", False if there is none
     */
    private function _find_blockquote_end($text, $start)
    {
        $open  = '<blockquote';
        $close = '</blockquote>';
        $depth = 1;
        $pos   = $start + strlen($open);

        while (($end = stripos($text, $close, $pos)) !== false) {
            $next = stripos($text, $open, $pos);

            if ($next !== false && $next < $end) {
                // another opening tag comes first, one level deeper
                $depth++;
                $pos = $next + strlen($open);
            }
            else if (--$depth == 0) {
                return $end;
            }
            else {
                // closing tag of a nested element, continue behind it
                $pos = $end + strlen($close);
            }
        }

        return false;
    }

    /**
     * Callback function to correctly add citation markers for blockquote contents
     *
     * @param array $m PREG matches: [1] line start with existing markers, [2] line content
     *
     * @return string Line with one more citation marker
     */
    public function blockquote_citation_callback($m)
    {
        $line = ltrim($m[2]);

        // No space between markers of nested quotes (">> text").
        // $line may be empty (blank quoted line), don't read its first character then.
        $space = ($line !== '' && $line[0] == '>') ? '' : ' ';

        return $m[1] . '>' . $space . $line;
    }

    /**
     * Callback function for preg_replace_callback use.
     *
     * Handles the tags matched by $callback_search patterns.
     *
     * @param array $matches PREG matches
     *
     * @return string
     */
    public function tags_preg_callback($matches)
    {
        switch (strtolower($matches[1])) {
        case 'b':
        case 'strong':
            return $this->_toupper($matches[3]);
        case 'th':
            return $this->_toupper("\t\t". $matches[3] ."\n");
        case 'h':
            return $this->_toupper("\n\n". $matches[3] ."\n\n");
        case 'a':
            // Remove spaces in URL (#1487805)
            $url = str_replace(' ', '', $matches[3]);
            return $this->_build_link_list($url, $matches[4]);
        default:
            // not reachable with the built-in patterns; keep the input
            // untouched rather than replacing it with nothing
            return $matches[0];
        }
    }

    /**
     * Callback function for preg_replace_callback use in PRE content handler.
     *
     * @param array $matches PREG matches
     *
     * @return string Converted content of the PRE element being processed
     */
    public function pre_preg_callback($matches)
    {
        return $this->pre_content;
    }

    /**
     * Strtoupper function with HTML tags and entities handling.
     *
     * @param string $str Text to convert
     *
     * @return string Converted text
     */
    private function _toupper($str)
    {
        // string can contain HTML tags
        // (limit of -1 means "no limit"; passing null is deprecated since PHP 8.1)
        $chunks = preg_split('/(<[^>]*>)/', $str, -1, PREG_SPLIT_NO_EMPTY | PREG_SPLIT_DELIM_CAPTURE);

        // convert toupper only the text between HTML tags
        foreach ($chunks as $idx => $chunk) {
            if ($chunk[0] !== '<') {
                $chunks[$idx] = $this->_strtoupper($chunk);
            }
        }

        return implode('', $chunks);
    }

    /**
     * Strtoupper multibyte wrapper function with HTML entities handling.
     *
     * Entities are decoded before the conversion (so "&amp;" does not become
     * "&AMP;") and special characters are encoded again afterwards.
     *
     * @param string $str Text to convert
     *
     * @return string Converted text
     */
    private function _strtoupper($str)
    {
        $str = html_entity_decode($str, ENT_COMPAT, $this->charset);
        $str = mb_strtoupper($str);
        $str = htmlspecialchars($str, ENT_COMPAT, $this->charset);

        return $str;
    }
}