comparison program/lib/Roundcube/rcube_charset.php @ 0:4681f974d28b

vanilla 1.3.3 distro, I hope
author Charlie Root
date Thu, 04 Jan 2018 15:52:31 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:4681f974d28b
1 <?php
2
3 /**
4 +-----------------------------------------------------------------------+
5 | This file is part of the Roundcube Webmail client |
6 | Copyright (C) 2005-2012, The Roundcube Dev Team |
7 | Copyright (C) 2011-2012, Kolab Systems AG |
8 | Copyright (C) 2000 Edmund Grimley Evans <edmundo@rano.org> |
9 | |
10 | Licensed under the GNU General Public License version 3 or |
11 | any later version with exceptions for skins & plugins. |
12 | See the README file for a full license statement. |
13 | |
14 | PURPOSE: |
15 | Provide charset conversion functionality |
16 +-----------------------------------------------------------------------+
17 | Author: Thomas Bruederli <roundcube@gmail.com> |
18 | Author: Aleksander Machniak <alec@alec.pl> |
19 +-----------------------------------------------------------------------+
20 */
21
/**
 * Character sets conversion functionality
 *
 * A collection of static helpers used to normalize charset names, convert
 * strings between charsets (using iconv, mbstring or the bundled UTF-7
 * routines), guess the charset of a string and strip invalid UTF-8 sequences.
 *
 * @package    Framework
 * @subpackage Core
 * @author     Thomas Bruederli <roundcube@gmail.com>
 * @author     Aleksander Machniak <alec@alec.pl>
 * @author     Edmund Grimley Evans <edmundo@rano.org>
 */
class rcube_charset
{
    /**
     * Charset aliases (some of them from HTML5 spec.)
     *
     * Keys are charset names reduced to upper-case letters and digits only,
     * which is the form parse_charset() uses for the lookup.
     *
     * @var array
     */
    public static $aliases = array(
        'USASCII'       => 'WINDOWS-1252',
        'ANSIX31101983' => 'WINDOWS-1252',
        'ANSIX341968'   => 'WINDOWS-1252',
        'UNKNOWN8BIT'   => 'ISO-8859-15',
        'UNKNOWN'       => 'ISO-8859-15',
        'USERDEFINED'   => 'ISO-8859-15',
        'KSC56011987'   => 'EUC-KR',
        'GB2312'        => 'GBK',
        'GB231280'      => 'GBK',
        'UNICODE'       => 'UTF-8',
        'UTF7IMAP'      => 'UTF7-IMAP',
        'TIS620'        => 'WINDOWS-874',
        'ISO88599'      => 'WINDOWS-1254',
        'ISO885911'     => 'WINDOWS-874',
        'MACROMAN'      => 'MACINTOSH',
        '77'            => 'MAC',
        '128'           => 'SHIFT-JIS',
        '129'           => 'CP949',
        '130'           => 'CP1361',
        '134'           => 'GBK',
        '136'           => 'BIG5',
        '161'           => 'WINDOWS-1253',
        '162'           => 'WINDOWS-1254',
        '163'           => 'WINDOWS-1258',
        '177'           => 'WINDOWS-1255',
        '178'           => 'WINDOWS-1256',
        '186'           => 'WINDOWS-1257',
        '204'           => 'WINDOWS-1251',
        '222'           => 'WINDOWS-874',
        '238'           => 'WINDOWS-1250',
        'MS950'         => 'CP950',
        'WINDOWS949'    => 'UHC',
    );

    /**
     * Windows codepages (codepage identifier => charset name)
     *
     * @var array
     */
    public static $windows_codepages = array(
        37 => 'IBM037', // IBM EBCDIC US-Canada
        437 => 'IBM437', // OEM United States
        500 => 'IBM500', // IBM EBCDIC International
        708 => 'ASMO-708', // Arabic (ASMO 708)
        720 => 'DOS-720', // Arabic (Transparent ASMO); Arabic (DOS)
        737 => 'IBM737', // OEM Greek (formerly 437G); Greek (DOS)
        775 => 'IBM775', // OEM Baltic; Baltic (DOS)
        850 => 'IBM850', // OEM Multilingual Latin 1; Western European (DOS)
        852 => 'IBM852', // OEM Latin 2; Central European (DOS)
        855 => 'IBM855', // OEM Cyrillic (primarily Russian)
        857 => 'IBM857', // OEM Turkish; Turkish (DOS)
        858 => 'IBM00858', // OEM Multilingual Latin 1 + Euro symbol
        860 => 'IBM860', // OEM Portuguese; Portuguese (DOS)
        861 => 'IBM861', // OEM Icelandic; Icelandic (DOS)
        862 => 'DOS-862', // OEM Hebrew; Hebrew (DOS)
        863 => 'IBM863', // OEM French Canadian; French Canadian (DOS)
        864 => 'IBM864', // OEM Arabic; Arabic (864)
        865 => 'IBM865', // OEM Nordic; Nordic (DOS)
        866 => 'cp866', // OEM Russian; Cyrillic (DOS)
        869 => 'IBM869', // OEM Modern Greek; Greek, Modern (DOS)
        870 => 'IBM870', // IBM EBCDIC Multilingual/ROECE (Latin 2); IBM EBCDIC Multilingual Latin 2
        874 => 'windows-874', // ANSI/OEM Thai (ISO 8859-11); Thai (Windows)
        875 => 'cp875', // IBM EBCDIC Greek Modern
        932 => 'shift_jis', // ANSI/OEM Japanese; Japanese (Shift-JIS)
        936 => 'gb2312', // ANSI/OEM Simplified Chinese (PRC, Singapore); Chinese Simplified (GB2312)
        950 => 'big5', // ANSI/OEM Traditional Chinese (Taiwan; Hong Kong SAR, PRC); Chinese Traditional (Big5)
        1026 => 'IBM1026', // IBM EBCDIC Turkish (Latin 5)
        1047 => 'IBM01047', // IBM EBCDIC Latin 1/Open System
        1140 => 'IBM01140', // IBM EBCDIC US-Canada (037 + Euro symbol); IBM EBCDIC (US-Canada-Euro)
        1141 => 'IBM01141', // IBM EBCDIC Germany (20273 + Euro symbol); IBM EBCDIC (Germany-Euro)
        1142 => 'IBM01142', // IBM EBCDIC Denmark-Norway (20277 + Euro symbol); IBM EBCDIC (Denmark-Norway-Euro)
        1143 => 'IBM01143', // IBM EBCDIC Finland-Sweden (20278 + Euro symbol); IBM EBCDIC (Finland-Sweden-Euro)
        1144 => 'IBM01144', // IBM EBCDIC Italy (20280 + Euro symbol); IBM EBCDIC (Italy-Euro)
        1145 => 'IBM01145', // IBM EBCDIC Latin America-Spain (20284 + Euro symbol); IBM EBCDIC (Spain-Euro)
        1146 => 'IBM01146', // IBM EBCDIC United Kingdom (20285 + Euro symbol); IBM EBCDIC (UK-Euro)
        1147 => 'IBM01147', // IBM EBCDIC France (20297 + Euro symbol); IBM EBCDIC (France-Euro)
        1148 => 'IBM01148', // IBM EBCDIC International (500 + Euro symbol); IBM EBCDIC (International-Euro)
        1149 => 'IBM01149', // IBM EBCDIC Icelandic (20871 + Euro symbol); IBM EBCDIC (Icelandic-Euro)
        1200 => 'UTF-16', // Unicode UTF-16, little endian byte order (BMP of ISO 10646); available only to managed applications
        1201 => 'UTF-16BE', // Unicode UTF-16, big endian byte order; available only to managed applications
        1250 => 'windows-1250', // ANSI Central European; Central European (Windows)
        1251 => 'windows-1251', // ANSI Cyrillic; Cyrillic (Windows)
        1252 => 'windows-1252', // ANSI Latin 1; Western European (Windows)
        1253 => 'windows-1253', // ANSI Greek; Greek (Windows)
        1254 => 'windows-1254', // ANSI Turkish; Turkish (Windows)
        1255 => 'windows-1255', // ANSI Hebrew; Hebrew (Windows)
        1256 => 'windows-1256', // ANSI Arabic; Arabic (Windows)
        1257 => 'windows-1257', // ANSI Baltic; Baltic (Windows)
        1258 => 'windows-1258', // ANSI/OEM Vietnamese; Vietnamese (Windows)
        10000 => 'macintosh', // MAC Roman; Western European (Mac)
        12000 => 'UTF-32', // Unicode UTF-32, little endian byte order; available only to managed applications
        12001 => 'UTF-32BE', // Unicode UTF-32, big endian byte order; available only to managed applications
        20127 => 'US-ASCII', // US-ASCII (7-bit)
        20273 => 'IBM273', // IBM EBCDIC Germany
        20277 => 'IBM277', // IBM EBCDIC Denmark-Norway
        20278 => 'IBM278', // IBM EBCDIC Finland-Sweden
        20280 => 'IBM280', // IBM EBCDIC Italy
        20284 => 'IBM284', // IBM EBCDIC Latin America-Spain
        20285 => 'IBM285', // IBM EBCDIC United Kingdom
        20290 => 'IBM290', // IBM EBCDIC Japanese Katakana Extended
        20297 => 'IBM297', // IBM EBCDIC France
        20420 => 'IBM420', // IBM EBCDIC Arabic
        20423 => 'IBM423', // IBM EBCDIC Greek
        20424 => 'IBM424', // IBM EBCDIC Hebrew
        20838 => 'IBM-Thai', // IBM EBCDIC Thai
        20866 => 'koi8-r', // Russian (KOI8-R); Cyrillic (KOI8-R)
        20871 => 'IBM871', // IBM EBCDIC Icelandic
        20880 => 'IBM880', // IBM EBCDIC Cyrillic Russian
        20905 => 'IBM905', // IBM EBCDIC Turkish
        20924 => 'IBM00924', // IBM EBCDIC Latin 1/Open System (1047 + Euro symbol)
        20932 => 'EUC-JP', // Japanese (JIS 0208-1990 and 0212-1990)
        20936 => 'cp20936', // Simplified Chinese (GB2312); Chinese Simplified (GB2312-80)
        20949 => 'cp20949', // Korean Wansung
        21025 => 'cp1025', // IBM EBCDIC Cyrillic Serbian-Bulgarian
        21866 => 'koi8-u', // Ukrainian (KOI8-U); Cyrillic (KOI8-U)
        28591 => 'iso-8859-1', // ISO 8859-1 Latin 1; Western European (ISO)
        28592 => 'iso-8859-2', // ISO 8859-2 Central European; Central European (ISO)
        28593 => 'iso-8859-3', // ISO 8859-3 Latin 3
        28594 => 'iso-8859-4', // ISO 8859-4 Baltic
        28595 => 'iso-8859-5', // ISO 8859-5 Cyrillic
        28596 => 'iso-8859-6', // ISO 8859-6 Arabic
        28597 => 'iso-8859-7', // ISO 8859-7 Greek
        28598 => 'iso-8859-8', // ISO 8859-8 Hebrew; Hebrew (ISO-Visual)
        28599 => 'iso-8859-9', // ISO 8859-9 Turkish
        28603 => 'iso-8859-13', // ISO 8859-13 Estonian
        28605 => 'iso-8859-15', // ISO 8859-15 Latin 9
        38598 => 'iso-8859-8-i', // ISO 8859-8 Hebrew; Hebrew (ISO-Logical)
        50220 => 'iso-2022-jp', // ISO 2022 Japanese with no halfwidth Katakana; Japanese (JIS)
        50221 => 'csISO2022JP', // ISO 2022 Japanese with halfwidth Katakana; Japanese (JIS-Allow 1 byte Kana)
        50222 => 'iso-2022-jp', // ISO 2022 Japanese JIS X 0201-1989; Japanese (JIS-Allow 1 byte Kana - SO/SI)
        50225 => 'iso-2022-kr', // ISO 2022 Korean
        51932 => 'EUC-JP', // EUC Japanese
        51936 => 'EUC-CN', // EUC Simplified Chinese; Chinese Simplified (EUC)
        51949 => 'EUC-KR', // EUC Korean
        52936 => 'hz-gb-2312', // HZ-GB2312 Simplified Chinese; Chinese Simplified (HZ)
        54936 => 'GB18030', // Windows XP and later: GB18030 Simplified Chinese (4 byte); Chinese Simplified (GB18030)
        65000 => 'UTF-7',
        65001 => 'UTF-8',
    );

    /**
     * Catch an error and throw an exception.
     *
     * Installed temporarily by convert() so that notices/warnings raised
     * by iconv/mbstring can be handled with try/catch.
     *
     * @param int    $errno  Level of the error
     * @param string $errstr Error message
     *
     * @throws ErrorException Always
     */
    public static function error_handler($errno, $errstr)
    {
        throw new ErrorException($errstr, 0, $errno);
    }

    /**
     * Parse and validate charset name string (see #1485758).
     * Sometimes charset string is malformed, there are also charset aliases
     * but we need strict names for charset conversion (specially utf8 class)
     *
     * Results are memoized per input string for the lifetime of the request.
     *
     * @param string $input Input charset name
     *
     * @return string|null The validated charset name, NULL for "BINARY"
     */
    public static function parse_charset($input)
    {
        static $charsets = array();

        if (isset($charsets[$input])) {
            return $charsets[$input];
        }

        // cast first: a NULL argument must not be passed to strtoupper()
        $charset = strtoupper((string) $input);

        $charset = preg_replace(array(
            '/^[^0-9A-Z]+/',    // e.g. _ISO-8859-JP$SIO
            '/\$.*$/',          // e.g. _ISO-8859-JP$SIO
            '/UNICODE-1-1-*/',  // RFC1641/1642
            '/^X-/',            // X- prefix (e.g. X-ROMAN8 => ROMAN8)
        ), '', $charset);

        if ($charset == 'BINARY') {
            return $charsets[$input] = null;
        }

        // allow A-Z and 0-9 only
        $str = preg_replace('/[^A-Z0-9]/', '', $charset);

        if (isset(self::$aliases[$str])) {
            $result = self::$aliases[$str];
        }
        // UTF
        else if (preg_match('/U[A-Z][A-Z](7|8|16|32)(BE|LE)*/', $str, $m)) {
            // the byte-order group is optional: when it did not match,
            // preg_match() leaves $m[2] unset
            $result = 'UTF-' . $m[1] . (isset($m[2]) ? $m[2] : '');
        }
        // ISO-8859
        else if (preg_match('/ISO8859([0-9]{0,2})/', $str, $m)) {
            $iso = 'ISO-8859-' . ($m[1] ?: 1);
            // some clients sends windows-1252 text as latin1,
            // it is safe to use windows-1252 for all latin1
            $result = $iso == 'ISO-8859-1' ? 'WINDOWS-1252' : $iso;
        }
        // handle broken charset names e.g. WINDOWS-1250HTTP-EQUIVCONTENT-TYPE
        else if (preg_match('/(WIN|WINDOWS)([0-9]+)/', $str, $m)) {
            $result = 'WINDOWS-' . $m[2];
        }
        // LATIN
        else if (preg_match('/LATIN(.*)/', $str, $m)) {
            $aliases = array('2' => 2, '3' => 3, '4' => 4, '5' => 9, '6' => 10,
                '7' => 13, '8' => 14, '9' => 15, '10' => 16,
                'ARABIC' => 6, 'CYRILLIC' => 5, 'GREEK' => 7, 'GREEK1' => 7, 'HEBREW' => 8
            );

            // some clients sends windows-1252 text as latin1,
            // it is safe to use windows-1252 for all latin1
            if ($m[1] == 1) {
                $result = 'WINDOWS-1252';
            }
            // if iconv is not supported we need ISO labels, it's also safe for iconv
            else if (!empty($aliases[$m[1]])) {
                $result = 'ISO-8859-'.$aliases[$m[1]];
            }
            // iconv requires conversion of e.g. LATIN-1 to LATIN1
            else {
                $result = $str;
            }
        }
        else {
            $result = $charset;
        }

        $charsets[$input] = $result;

        return $result;
    }

    /**
     * Convert a string from one charset to another.
     * Uses mbstring and iconv functions if possible
     *
     * The converters are tried in order: iconv, mbstring, bundled UTF-7
     * routines. If none of them succeeds the input string is returned
     * unchanged.
     *
     * @param string $str  Input string
     * @param string $from Suspected charset of the input string
     * @param string $to   Target charset to convert to; defaults to RCUBE_CHARSET
     *
     * @return string Converted string
     */
    public static function convert($str, $from, $to = null)
    {
        static $iconv_options = null;
        static $mbstring_sc   = null;

        $to   = empty($to) ? RCUBE_CHARSET : strtoupper($to);
        $from = self::parse_charset($from);

        // It is a common case when UTF-16 charset is used with US-ASCII content (#1488654)
        // In that case we can just skip the conversion (use UTF-8)
        if ($from == 'UTF-16' && !preg_match('/[^\x00-\x7F]/', $str)) {
            $from = 'UTF-8';
        }

        if ($from == $to || empty($str) || empty($from)) {
            return $str;
        }

        // detect (once) whether iconv is available and supports options
        if ($iconv_options === null) {
            if (function_exists('iconv')) {
                // ignore characters not available in output charset
                $iconv_options = '//IGNORE';
                if (iconv('', $iconv_options, '') === false) {
                    // iconv implementation does not support options
                    $iconv_options = '';
                }
            }
            else {
                $iconv_options = false;
            }
        }

        // convert charset using iconv module
        if ($iconv_options !== false && $from != 'UTF7-IMAP' && $to != 'UTF7-IMAP'
            && $from !== 'ISO-2022-JP'
        ) {
            // throw an exception if iconv reports an illegal character in input
            // it means that input string has been truncated (E_NOTICE).
            // PHP 8 reports an unsupported charset with E_WARNING, catch it too
            // so we silently fall through to the next converter.
            set_error_handler(array('rcube_charset', 'error_handler'), E_NOTICE | E_WARNING);
            try {
                $out = iconv($from, $to . $iconv_options, $str);
            }
            catch (ErrorException $e) {
                $out = false;
            }
            restore_error_handler();

            if ($out !== false) {
                return $out;
            }
        }

        // remember (once) the configured substitute character, false = no mbstring
        if ($mbstring_sc === null) {
            $mbstring_sc = extension_loaded('mbstring') ? mb_substitute_character() : false;
        }

        // convert charset using mbstring module
        if ($mbstring_sc !== false) {
            $aliases = array(
                'WINDOWS-1257' => 'ISO-8859-13',
                'US-ASCII'     => 'ASCII',
                'ISO-2022-JP'  => 'ISO-2022-JP-MS',
            );

            // most charsets have no mbstring alias, do not read undefined keys
            $mb_from = isset($aliases[$from]) ? $aliases[$from] : $from;
            $mb_to   = isset($aliases[$to]) ? $aliases[$to] : $to;

            // Do the same as //IGNORE with iconv
            mb_substitute_character('none');

            // throw an exception if mbstring reports an illegal character in input
            // using mb_check_encoding() is much slower
            set_error_handler(array('rcube_charset', 'error_handler'), E_WARNING);
            try {
                $out = mb_convert_encoding($str, $mb_to, $mb_from);
            }
            catch (ErrorException $e) {
                $out = false;
            }
            // PHP 8 throws ValueError on unknown encoding name; without this
            // the handler and substitute character would never be restored
            catch (ValueError $e) {
                $out = false;
            }
            restore_error_handler();

            mb_substitute_character($mbstring_sc);

            if ($out !== false) {
                return $out;
            }
        }

        // convert charset using bundled classes/functions
        if ($to == 'UTF-8') {
            if ($from == 'UTF7-IMAP') {
                if ($out = self::utf7imap_to_utf8($str)) {
                    return $out;
                }
            }
            else if ($from == 'UTF-7') {
                if ($out = self::utf7_to_utf8($str)) {
                    return $out;
                }
            }
        }

        // encode string for output
        if ($from == 'UTF-8') {
            // @TODO: we need a function for UTF-7 (RFC2152) conversion
            if ($to == 'UTF7-IMAP' || $to == 'UTF-7') {
                if ($out = self::utf8_to_utf7imap($str)) {
                    return $out;
                }
            }
        }

        // $out is unset only if no converter at all has been tried
        if (!isset($out)) {
            trigger_error("No suitable function found for '$from' to '$to' conversion");
        }

        // return original string
        return $str;
    }

    /**
     * Converts string from standard UTF-7 (RFC 2152) to UTF-8.
     *
     * A '+' starts a Base64 encoded run of UTF-16BE code units, "+-" stands
     * for a literal '+'. The run ends on the first non-Base64 character;
     * only a terminating '-' is absorbed, any other terminator is processed
     * as a regular character.
     *
     * @param string $str Input string (UTF-7)
     *
     * @return string Converted string (UTF-8)
     */
    public static function utf7_to_utf8($str)
    {
        // RFC 2152 uses the standard Base64 alphabet (incl. '/'), without padding
        $alphabet = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/';

        $str = strval($str);
        $len = strlen($str);
        $res = '';
        $i   = 0;

        while ($i < $len) {
            // copy directly encoded characters up to the next shift character
            $plain = strcspn($str, '+', $i);
            if ($plain > 0) {
                $res .= substr($str, $i, $plain);
                $i   += $plain;
                continue;
            }

            // skip the '+' shift character
            $i++;

            // length of the Base64 run that follows
            $run = $i < $len ? strspn($str, $alphabet, $i) : 0;

            if ($run > 0) {
                $res .= self::utf16_to_utf8(base64_decode(substr($str, $i, $run)));
                $i   += $run;

                // a terminating '-' is absorbed, other characters are kept
                if ($i < $len && $str[$i] === '-') {
                    $i++;
                }
            }
            // "+-" encodes a literal plus sign
            else if ($i < $len && $str[$i] === '-') {
                $res .= '+';
                $i++;
            }
        }

        return $res;
    }

    /**
     * Converts string from UTF-16 to UTF-8 (helper for utf-7 to utf-8 conversion)
     *
     * Input is read as big-endian 16-bit code units. Surrogate pairs are
     * combined into a single 4-byte UTF-8 sequence. A dangling odd byte at
     * the end of input is ignored. Note: U+0000 is written as the two-byte
     * sequence C0 80 (the 1-byte form is used for U+0001..U+007F only).
     *
     * @param string $str Input string
     *
     * @return string The converted string
     */
    public static function utf16_to_utf8($str)
    {
        $len = strlen($str);
        $dec = '';

        // stop before an incomplete (single byte) code unit
        for ($i = 0; $i + 1 < $len; $i += 2) {
            $c = ord($str[$i]) << 8 | ord($str[$i + 1]);

            // high surrogate followed by a low surrogate: character above U+FFFF
            if ($c >= 0xD800 && $c <= 0xDBFF && $i + 3 < $len) {
                $low = ord($str[$i + 2]) << 8 | ord($str[$i + 3]);

                if ($low >= 0xDC00 && $low <= 0xDFFF) {
                    $c = 0x10000 + (($c - 0xD800) << 10) + ($low - 0xDC00);

                    $dec .= chr(0xF0 | (($c >> 18) & 0x07));
                    $dec .= chr(0x80 | (($c >> 12) & 0x3F));
                    $dec .= chr(0x80 | (($c >> 6) & 0x3F));
                    $dec .= chr(0x80 | (($c >> 0) & 0x3F));

                    // the low surrogate has been consumed as well
                    $i += 2;
                    continue;
                }
            }

            if ($c >= 0x0001 && $c <= 0x007F) {
                $dec .= chr($c);
            }
            else if ($c > 0x07FF) {
                $dec .= chr(0xE0 | (($c >> 12) & 0x0F));
                $dec .= chr(0x80 | (($c >> 6) & 0x3F));
                $dec .= chr(0x80 | (($c >> 0) & 0x3F));
            }
            else {
                $dec .= chr(0xC0 | (($c >> 6) & 0x1F));
                $dec .= chr(0x80 | (($c >> 0) & 0x3F));
            }
        }

        return $dec;
    }

    /**
     * Convert the data ($str) from RFC 2060's UTF-7 to UTF-8.
     * If input data is invalid, return an empty string.
     * RFC 2060 obviously intends the encoding to be unique (see
     * point 5 in section 5.1.3), so we reject any non-canonical
     * form, such as &ACY- (instead of &-) or &AMA-&AMA- (instead
     * of &AMAAwA-).
     *
     * Translated from C to PHP by Thomas Bruederli <roundcube@gmail.com>
     *
     * @param string $str Input string (UTF7-IMAP)
     *
     * @return string Output string (UTF-8), empty string on error
     */
    public static function utf7imap_to_utf8($str)
    {
        // Modified Base64 character => 6-bit value, -1 for invalid characters
        $Index_64 = array(
            -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
            -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
            -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,62, 63,-1,-1,-1,
            52,53,54,55, 56,57,58,59, 60,61,-1,-1, -1,-1,-1,-1,
            -1, 0, 1, 2,  3, 4, 5, 6,  7, 8, 9,10, 11,12,13,14,
            15,16,17,18, 19,20,21,22, 23,24,25,-1, -1,-1,-1,-1,
            -1,26,27,28, 29,30,31,32, 33,34,35,36, 37,38,39,40,
            41,42,43,44, 45,46,47,48, 49,50,51,-1, -1,-1,-1,-1
        );

        $u7len = strlen($str);
        $str   = strval($str);
        $p     = '';
        $err   = '';

        // $u7len counts the bytes left to process, starting with $str[$i]
        for ($i = 0; $u7len > 0; $i++, $u7len--) {
            $u7 = $str[$i];
            if ($u7 == '&') {
                $i++;
                $u7len--;

                // '&' at the very end of input: BASE64 section not terminated
                if ($u7len < 1) {
                    return $err;
                }

                $u7 = $str[$i];

                // "&-" encodes a literal ampersand
                if ($u7len && $u7 == '-') {
                    $p .= '&';
                    continue;
                }

                // $ch collects bits of the current UTF-16 code unit,
                // $k is the shift to apply to the next 6-bit group
                $ch = 0;
                $k  = 10;
                for (; $u7len > 0; $i++, $u7len--) {
                    $u7 = $str[$i];

                    if ((ord($u7) & 0x80) || ($b = $Index_64[ord($u7)]) == -1) {
                        break;
                    }

                    if ($k > 0) {
                        $ch |= $b << $k;
                        $k -= 6;
                    }
                    else {
                        // code unit complete
                        $ch |= $b >> (-$k);
                        if ($ch < 0x80) {
                            // Printable US-ASCII
                            if (0x20 <= $ch && $ch < 0x7f) {
                                return $err;
                            }
                            $p .= chr($ch);
                        }
                        else if ($ch < 0x800) {
                            $p .= chr(0xc0 | ($ch >> 6));
                            $p .= chr(0x80 | ($ch & 0x3f));
                        }
                        else {
                            $p .= chr(0xe0 | ($ch >> 12));
                            $p .= chr(0x80 | (($ch >> 6) & 0x3f));
                            $p .= chr(0x80 | ($ch & 0x3f));
                        }

                        // keep the bits that belong to the next code unit
                        $ch = ($b << (16 + $k)) & 0xffff;
                        $k += 10;
                    }
                }

                // Non-zero or too many extra bits
                if ($ch || $k < 6) {
                    return $err;
                }

                // BASE64 not properly terminated
                if (!$u7len || $u7 != '-') {
                    return $err;
                }

                // Adjacent BASE64 sections
                if ($u7len > 2 && $str[$i+1] == '&' && $str[$i+2] != '-') {
                    return $err;
                }
            }
            // Not printable US-ASCII
            else if (ord($u7) < 0x20 || ord($u7) >= 0x7f) {
                return $err;
            }
            else {
                $p .= $u7;
            }
        }

        return $p;
    }

    /**
     * Convert the data ($str) from UTF-8 to RFC 2060's UTF-7.
     * Unicode characters above U+FFFF are replaced by U+FFFE.
     * If input data is invalid, return an empty string.
     *
     * Translated from C to PHP by Thomas Bruederli <roundcube@gmail.com>
     *
     * @param string $str Input string (UTF-8)
     *
     * @return string Output string (UTF7-IMAP), empty string on error
     */
    public static function utf8_to_utf7imap($str)
    {
        // Modified Base64 alphabet (',' in place of '/')
        $B64Chars = array(
            'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O',
            'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd',
            'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's',
            't', 'u', 'v', 'w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', '6', '7',
            '8', '9', '+', ','
        );

        $u8len  = strlen($str);
        $base64 = 0;    // are we inside of a BASE64 section?
        $b      = 0;    // pending bits of the BASE64 section (set when it starts)
        $k      = 10;   // shift for the next code unit (set when it starts)
        $i      = 0;
        $p      = '';
        $err    = '';

        while ($u8len) {
            $u8 = $str[$i];
            $c  = ord($u8);

            // decode the lead byte: $ch = payload bits, $n = continuation bytes
            if ($c < 0x80) {
                $ch = $c;
                $n  = 0;
            }
            else if ($c < 0xc2) {
                return $err;
            }
            else if ($c < 0xe0) {
                $ch = $c & 0x1f;
                $n  = 1;
            }
            else if ($c < 0xf0) {
                $ch = $c & 0x0f;
                $n  = 2;
            }
            else if ($c < 0xf8) {
                $ch = $c & 0x07;
                $n  = 3;
            }
            else if ($c < 0xfc) {
                $ch = $c & 0x03;
                $n  = 4;
            }
            else if ($c < 0xfe) {
                $ch = $c & 0x01;
                $n  = 5;
            }
            else {
                return $err;
            }

            $i++;
            $u8len--;

            // truncated sequence
            if ($n > $u8len) {
                return $err;
            }

            for ($j = 0; $j < $n; $j++) {
                $o = ord($str[$i+$j]);
                if (($o & 0xc0) != 0x80) {
                    return $err;
                }
                $ch = ($ch << 6) | ($o & 0x3f);
            }

            // overlong encoding
            if ($n > 1 && !($ch >> ($n * 5 + 1))) {
                return $err;
            }

            $i += $n;
            $u8len -= $n;

            if ($ch < 0x20 || $ch >= 0x7f) {
                // character needs BASE64 encoding
                if (!$base64) {
                    $p .= '&';
                    $base64 = 1;
                    $b = 0;
                    $k = 10;
                }
                if ($ch & ~0xffff) {
                    $ch = 0xfffe;
                }

                $p .= $B64Chars[($b | $ch >> $k)];
                $k -= 6;
                for (; $k >= 0; $k -= 6) {
                    $p .= $B64Chars[(($ch >> $k) & 0x3f)];
                }

                $b = ($ch << (-$k)) & 0x3f;
                $k += 16;
            }
            else {
                // printable US-ASCII, close the BASE64 section if any
                if ($base64) {
                    if ($k > 10) {
                        $p .= $B64Chars[$b];
                    }
                    $p .= '-';
                    $base64 = 0;
                }

                $p .= chr($ch);
                if (chr($ch) == '&') {
                    $p .= '-';
                }
            }
        }

        // close the BASE64 section left open at the end of input
        if ($base64) {
            if ($k > 10) {
                $p .= $B64Chars[$b];
            }
            $p .= '-';
        }

        return $p;
    }

    /**
     * A method to guess character set of a string.
     *
     * Checks, in order: byte order marks, NUL-byte patterns typical for
     * UTF-16/UTF-32, a language specific priority list (mbstring),
     * mb_detect_encoding() and finally a UTF-8 validity test.
     *
     * @param string $string   String
     * @param string $failover Default result for failover
     * @param string $language User language; taken from the rcube instance if empty
     *
     * @return string Charset name
     */
    public static function detect($string, $failover = null, $language = null)
    {
        if (substr($string, 0, 4) == "\0\0\xFE\xFF") return 'UTF-32BE';  // Big Endian
        if (substr($string, 0, 4) == "\xFF\xFE\0\0") return 'UTF-32LE';  // Little Endian
        if (substr($string, 0, 2) == "\xFE\xFF")     return 'UTF-16BE';  // Big Endian
        if (substr($string, 0, 2) == "\xFF\xFE")     return 'UTF-16LE';  // Little Endian
        if (substr($string, 0, 3) == "\xEF\xBB\xBF") return 'UTF-8';

        // heuristics (they inspect the first four bytes, so require them to exist)
        if (strlen($string) >= 4) {
            if ($string[0] == "\0" && $string[1] == "\0" && $string[2] == "\0" && $string[3] != "\0") return 'UTF-32BE';
            if ($string[0] != "\0" && $string[1] == "\0" && $string[2] == "\0" && $string[3] == "\0") return 'UTF-32LE';
            if ($string[0] == "\0" && $string[1] != "\0" && $string[2] == "\0" && $string[3] != "\0") return 'UTF-16BE';
            if ($string[0] != "\0" && $string[1] == "\0" && $string[2] != "\0" && $string[3] == "\0") return 'UTF-16LE';
        }

        if (empty($language)) {
            $rcube    = rcube::get_instance();
            $language = $rcube->get_user_language();
        }

        // no priority list unless the language has one below
        $prio = null;

        // Prioritize charsets according to current language (#1485669)
        switch ($language) {
        case 'ja_JP':
            $prio = array('ISO-2022-JP', 'JIS', 'UTF-8', 'EUC-JP', 'eucJP-win', 'SJIS', 'SJIS-win');
            break;

        case 'zh_CN':
        case 'zh_TW':
            $prio = array('UTF-8', 'BIG-5', 'GB2312', 'EUC-TW');
            break;

        case 'ko_KR':
            $prio = array('UTF-8', 'EUC-KR', 'ISO-2022-KR');
            break;

        case 'ru_RU':
            $prio = array('UTF-8', 'WINDOWS-1251', 'KOI8-R');
            break;

        case 'tr_TR':
            $prio = array('UTF-8', 'ISO-8859-9', 'WINDOWS-1254');
            break;
        }

        // mb_detect_encoding() is not reliable for some charsets (#1490135)
        // use mb_check_encoding() to make charset priority lists really working
        if ($prio && function_exists('mb_check_encoding')) {
            foreach ($prio as $encoding) {
                if (mb_check_encoding($string, $encoding)) {
                    return $encoding;
                }
            }
        }

        if (function_exists('mb_detect_encoding')) {
            if (!$prio) {
                $prio = array('UTF-8', 'SJIS', 'GB2312',
                    'ISO-8859-1', 'ISO-8859-2', 'ISO-8859-3', 'ISO-8859-4',
                    'ISO-8859-5', 'ISO-8859-6', 'ISO-8859-7', 'ISO-8859-8', 'ISO-8859-9',
                    'ISO-8859-10', 'ISO-8859-13', 'ISO-8859-14', 'ISO-8859-15', 'ISO-8859-16',
                    'WINDOWS-1252', 'WINDOWS-1251', 'EUC-JP', 'EUC-TW', 'KOI8-R', 'BIG-5',
                    'ISO-2022-KR', 'ISO-2022-JP',
                );
            }

            $encodings = array_unique(array_merge($prio, mb_list_encodings()));

            if ($encoding = mb_detect_encoding($string, $encodings)) {
                return $encoding;
            }
        }

        // No match, check for UTF-8
        // from http://w3.org/International/questions/qa-forms-utf-8.html
        if (preg_match('/\A(
            [\x09\x0A\x0D\x20-\x7E]
            | [\xC2-\xDF][\x80-\xBF]
            | \xE0[\xA0-\xBF][\x80-\xBF]
            | [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}
            | \xED[\x80-\x9F][\x80-\xBF]
            | \xF0[\x90-\xBF][\x80-\xBF]{2}
            | [\xF1-\xF3][\x80-\xBF]{3}
            | \xF4[\x80-\x8F][\x80-\xBF]{2}
            )*\z/xs', substr($string, 0, 2048))
        ) {
            return 'UTF-8';
        }

        return $failover;
    }

    /**
     * Removes non-unicode characters from input.
     *
     * Arrays are processed recursively; values that are not strings are
     * returned untouched.
     *
     * @param mixed $input String or array.
     *
     * @return mixed String or array
     */
    public static function clean($input)
    {
        // handle input of type array
        if (is_array($input)) {
            foreach ($input as $idx => $val) {
                $input[$idx] = self::clean($val);
            }
            return $input;
        }

        if (!is_string($input) || $input == '') {
            return $input;
        }

        // iconv/mbstring are much faster (especially with long strings)
        if (function_exists('mb_convert_encoding')) {
            // drop invalid sequences instead of replacing them,
            // then put the configured substitute character back
            $msch = mb_substitute_character();
            mb_substitute_character('none');
            $res = mb_convert_encoding($input, 'UTF-8', 'UTF-8');
            mb_substitute_character($msch);

            if ($res !== false) {
                return $res;
            }
        }

        if (function_exists('iconv')) {
            if (($res = @iconv('UTF-8', 'UTF-8//IGNORE', $input)) !== false) {
                return $res;
            }
        }

        // pure-PHP fallback: collect every multibyte sequence in $seq and
        // copy it to the output only if it is a well-formed UTF-8 character
        $seq    = '';
        $out    = '';
        $regexp = '/^('.
//          '[\x00-\x7F]'.                                  // UTF8-1
            '|[\xC2-\xDF][\x80-\xBF]'.                      // UTF8-2
            '|\xE0[\xA0-\xBF][\x80-\xBF]'.                  // UTF8-3
            '|[\xE1-\xEC][\x80-\xBF][\x80-\xBF]'.           // UTF8-3
            '|\xED[\x80-\x9F][\x80-\xBF]'.                  // UTF8-3
            '|[\xEE-\xEF][\x80-\xBF][\x80-\xBF]'.           // UTF8-3
            '|\xF0[\x90-\xBF][\x80-\xBF][\x80-\xBF]'.       // UTF8-4
            '|[\xF1-\xF3][\x80-\xBF][\x80-\xBF][\x80-\xBF]'.// UTF8-4
            '|\xF4[\x80-\x8F][\x80-\xBF][\x80-\xBF]'.       // UTF8-4
            ')$/';

        for ($i = 0, $len = strlen($input); $i < $len; $i++) {
            $chr = $input[$i];
            $ord = ord($chr);

            // 1-byte character
            if ($ord <= 0x7F) {
                if ($seq !== '') {
                    $out .= preg_match($regexp, $seq) ? $seq : '';
                    $seq = '';
                }

                $out .= $chr;
            }
            // first byte of multibyte sequence
            else if ($ord >= 0xC0) {
                if ($seq !== '') {
                    $out .= preg_match($regexp, $seq) ? $seq : '';
                    $seq = '';
                }

                $seq = $chr;
            }
            // next byte of multibyte sequence
            else if ($seq !== '') {
                $seq .= $chr;
            }
        }

        if ($seq !== '') {
            $out .= preg_match($regexp, $seq) ? $seq : '';
        }

        return $out;
    }
}