include/lib/utf8.php

   1 <?php
   2 /**
   3  * UTF8 helper functions
   4  *
   5  * @license    LGPL (http://www.gnu.org/copyleft/lesser.html)
   6  * @author     Andreas Gohr <andi@splitbrain.org>
   7  */
   8
   9 /**
  10  * check for mb_string support
  11  */
  12 if(!defined('UTF8_MBSTRING')){
  13   if(function_exists('mb_substr') && !defined('UTF8_NOMBSTRING')){
  14     define('UTF8_MBSTRING',1);
  15   }else{
  16     define('UTF8_MBSTRING',0);
  17   }
  18 }
  19
  20 if(UTF8_MBSTRING){ mb_internal_encoding('UTF-8'); }
  21
  22
  23 /**
  24  * URL-Encode a filename to allow unicodecharacters
  25  *
  26  * Slashes are not encoded
  27  *
  28  * When the second parameter is true the string will
  29  * be encoded only if non ASCII characters are detected -
  30  * This makes it safe to run it multiple times on the
  31  * same string (default is true)
  32  *
  33  * @author Andreas Gohr <andi@splitbrain.org>
  34  * @see    urlencode
  35  */
  36 function utf8_encodeFN($file,$safe=true){
  37   if($safe && preg_match('#^[a-zA-Z0-9/_\-.%]+$#',$file)){
  38     return $file;
  39   }
  40   $file = urlencode($file);
  41   $file = str_replace('%2F','/',$file);
  42   return $file;
  43 }
  44
  45 /**
  46  * URL-Decode a filename
  47  *
  48  * This is just a wrapper around urldecode
  49  *
  50  * @author Andreas Gohr <andi@splitbrain.org>
  51  * @see    urldecode
  52  */
  53 function utf8_decodeFN($file){
  54   $file = urldecode($file);
  55   return $file;
  56 }
  57
  58 /**
  59  * Checks if a string contains 7bit ASCII only
  60  *
  61  * @author Andreas Gohr <andi@splitbrain.org>
  62  */
  63 function utf8_isASCII($str){
  64   for($i=0; $i<strlen($str); $i++){
  65     if(ord($str{$i}) >127) return false;
  66   }
  67   return true;
  68 }
  69
  70 /**
  71  * Strips all highbyte chars
  72  *
  73  * Returns a pure ASCII7 string
  74  *
  75  * @author Andreas Gohr <andi@splitbrain.org>
  76  */
  77 function utf8_strip($str){
  78   $ascii = '';
  79   for($i=0; $i<strlen($str); $i++){
  80     if(ord($str{$i}) <128){
  81       $ascii .= $str{$i};
  82     }
  83   }
  84   return $ascii;
  85 }
  86
  87 /**
  88  * Tries to detect if a string is in Unicode encoding
  89  *
  90  * @author <bmorel@ssi.fr>
  91  * @link   http://www.php.net/manual/en/function.utf8-encode.php
  92  */
  93 function utf8_check($Str) {
  94  for ($i=0; $i<strlen($Str); $i++) {
  95   $b = ord($Str[$i]);
  96   if ($b < 0x80) continue; # 0bbbbbbb
  97   elseif (($b & 0xE0) == 0xC0) $n=1; # 110bbbbb
  98   elseif (($b & 0xF0) == 0xE0) $n=2; # 1110bbbb
  99   elseif (($b & 0xF8) == 0xF0) $n=3; # 11110bbb
 100   elseif (($b & 0xFC) == 0xF8) $n=4; # 111110bb
 101   elseif (($b & 0xFE) == 0xFC) $n=5; # 1111110b
 102   else return false; # Does not match any model
 103   for ($j=0; $j<$n; $j++) { # n bytes matching 10bbbbbb follow ?
 104    if ((++$i == strlen($Str)) || ((ord($Str[$i]) & 0xC0) != 0x80))
 105    return false;
 106   }
 107  }
 108  return true;
 109 }
 110
 111 /**
 112  * Unicode aware replacement for strlen()
 113  *
 114  * utf8_decode() converts characters that are not in ISO-8859-1
 115  * to '?', which, for the purpose of counting, is alright - It's
 116  * even faster than mb_strlen.
 117  *
 118  * @author <chernyshevsky at hotmail dot com>
 119  * @see    strlen()
 120  * @see    utf8_decode()
 121  */
 122 function utf8_strlen($string){
 123   return strlen(utf8_decode($string));
 124 }
 125
 126 /**
 127  * UTF-8 aware alternative to substr
 128  *
 129  * Return part of a string given character offset (and optionally length)
 130  *
 131  * @author Harry Fuecks <hfuecks@gmail.com>
 132  * @author Chris Smith <chris@jalakai.co.uk>
 133  * @param string
 134  * @param integer number of UTF-8 characters offset (from left)
 135  * @param integer (optional) length in UTF-8 characters from offset
 136  * @return mixed string or false if failure
 137  */
 138 function utf8_substr($str, $offset, $length = null) {
 139     if(UTF8_MBSTRING){
 140         if( $length === null ){
 141             return mb_substr($str, $offset);
 142         }else{
 143             return mb_substr($str, $offset, $length);
 144         }
 145     }
 146
 147     /*
 148      * Notes:
 149      *
 150      * no mb string support, so we'll use pcre regex's with 'u' flag
 151      * pcre only supports repetitions of less than 65536, in order to accept up to MAXINT values for
 152      * offset and length, we'll repeat a group of 65535 characters when needed (ok, up to MAXINT-65536)
 153      *
 154      * substr documentation states false can be returned in some cases (e.g. offset > string length)
 155      * mb_substr never returns false, it will return an empty string instead.
 156      *
 157      * calculating the number of characters in the string is a relatively expensive operation, so
 158      * we only carry it out when necessary. It isn't necessary for +ve offsets and no specified length
 159      */
 160
 161     // cast parameters to appropriate types to avoid multiple notices/warnings
 162     $str = (string)$str;                          // generates E_NOTICE for PHP4 objects, but not PHP5 objects
 163     $offset = (int)$offset;
 164     if (!is_null($length)) $length = (int)$length;
 165
 166     // handle trivial cases
 167     if ($length === 0) return '';
 168     if ($offset < 0 && $length < 0 && $length < $offset) return '';
 169
 170     $offset_pattern = '';
 171     $length_pattern = '';
 172
 173     // normalise -ve offsets (we could use a tail anchored pattern, but they are horribly slow!)
 174     if ($offset < 0) {
 175       $strlen = strlen(utf8_decode($str));        // see notes
 176       $offset = $strlen + $offset;
 177       if ($offset < 0) $offset = 0;
 178     }
 179
 180     // establish a pattern for offset, a non-captured group equal in length to offset
 181     if ($offset > 0) {
 182       $Ox = (int)($offset/65535);
 183       $Oy = $offset%65535;
 184
 185       if ($Ox) $offset_pattern = '(?:.{65535}){'.$Ox.'}';
 186       $offset_pattern = '^(?:'.$offset_pattern.'.{'.$Oy.'})';
 187     } else {
 188       $offset_pattern = '^';                      // offset == 0; just anchor the pattern
 189     }
 190
 191     // establish a pattern for length
 192     if (is_null($length)) {
 193       $length_pattern = '(.*)$';                  // the rest of the string
 194     } else {
 195
 196       if (!isset($strlen)) $strlen = strlen(utf8_decode($str));    // see notes
 197       if ($offset > $strlen) return '';           // another trivial case
 198
 199       if ($length > 0) {
 200
 201         $length = min($strlen-$offset, $length);  // reduce any length that would go passed the end of the string
 202
 203         $Lx = (int)($length/65535);
 204         $Ly = $length%65535;
 205
 206         // +ve length requires ... a captured group of length characters
 207         if ($Lx) $length_pattern = '(?:.{65535}){'.$Lx.'}';
 208         $length_pattern = '('.$length_pattern.'.{'.$Ly.'})';
 209
 210       } else if ($length < 0) {
 211
 212         if ($length < ($offset - $strlen)) return '';
 213
 214         $Lx = (int)((-$length)/65535);
 215         $Ly = (-$length)%65535;
 216
 217         // -ve length requires ... capture everything except a group of -length characters
 218         //                         anchored at the tail-end of the string
 219         if ($Lx) $length_pattern = '(?:.{65535}){'.$Lx.'}';
 220         $length_pattern = '(.*)(?:'.$length_pattern.'.{'.$Ly.'})$';
 221       }
 222     }
 223
 224     if (!preg_match('#'.$offset_pattern.$length_pattern.'#us',$str,$match)) return '';
 225     return $match[1];
 226 }
 227
 228 /**
 229  * Unicode aware replacement for substr_replace()
 230  *
 231  * @author Andreas Gohr <andi@splitbrain.org>
 232  * @see    substr_replace()
 233  */
 234 function utf8_substr_replace($string, $replacement, $start , $length=0 ){
 235   $ret = '';
 236   if($start>0) $ret .= utf8_substr($string, 0, $start);
 237   $ret .= $replacement;
 238   $ret .= utf8_substr($string, $start+$length);
 239   return $ret;
 240 }
 241
 242
 243 /**
 244  * Unicode aware replacement for ltrim()
 245  *
 246  * @author Andreas Gohr <andi@splitbrain.org>
 247  * @see    ltrim()
 248  * @return string
 249  */
 250 function utf8_ltrim($str,$charlist=''){
 251   if($charlist == '') return ltrim($str);
 252
 253   //quote charlist for use in a characterclass
 254   $charlist = preg_replace('!([\\\\\\-\\]\\[/])!','\\\${1}',$charlist);
 255
 256   return preg_replace('/^['.$charlist.']+/u','',$str);
 257 }
 258
 259 /**
 260  * Unicode aware replacement for rtrim()
 261  *
 262  * @author Andreas Gohr <andi@splitbrain.org>
 263  * @see    rtrim()
 264  * @return string
 265  */
 266 function  utf8_rtrim($str,$charlist=''){
 267   if($charlist == '') return rtrim($str);
 268
 269   //quote charlist for use in a characterclass
 270   $charlist = preg_replace('!([\\\\\\-\\]\\[/])!','\\\${1}',$charlist);
 271
 272   return preg_replace('/['.$charlist.']+$/u','',$str);
 273 }
 274
 275 /**
 276  * Unicode aware replacement for trim()
 277  *
 278  * @author Andreas Gohr <andi@splitbrain.org>
 279  * @see    trim()
 280  * @return string
 281  */
 282 function  utf8_trim($str,$charlist='') {
 283   if($charlist == '') return trim($str);
 284
 285   return utf8_ltrim(utf8_rtrim($str));
 286 }
 287
 288
 289 /**
 290  * This is a unicode aware replacement for strtolower()
 291  *
 292  * Uses mb_string extension if available
 293  *
 294  * @author Leo Feyer <leo@typolight.org>
 295  * @see    strtolower()
 296  * @see    utf8_strtoupper()
 297  */
 298 function utf8_strtolower($string){
 299   if(UTF8_MBSTRING) return mb_strtolower($string,'utf-8');
 300
 301   global $UTF8_UPPER_TO_LOWER;
 302   return strtr($string,$UTF8_UPPER_TO_LOWER);
 303 }
 304
 305 /**
 306  * This is a unicode aware replacement for strtoupper()
 307  *
 308  * Uses mb_string extension if available
 309  *
 310  * @author Leo Feyer <leo@typolight.org>
 311  * @see    strtoupper()
 312  * @see    utf8_strtoupper()
 313  */
 314 function utf8_strtoupper($string){
 315   if(UTF8_MBSTRING) return mb_strtoupper($string,'utf-8');
 316
 317   global $UTF8_LOWER_TO_UPPER;
 318   return strtr($string,$UTF8_LOWER_TO_UPPER);
 319 }
 320
 321 /**
 322  * Replace accented UTF-8 characters by unaccented ASCII-7 equivalents
 323  *
 324  * Use the optional parameter to just deaccent lower ($case = -1) or upper ($case = 1)
 325  * letters. Default is to deaccent both cases ($case = 0)
 326  *
 327  * @author Andreas Gohr <andi@splitbrain.org>
 328  */
 329 function utf8_deaccent($string,$case=0){
 330   if($case <= 0){
 331     global $UTF8_LOWER_ACCENTS;
 332     $string = strtr($string,$UTF8_LOWER_ACCENTS);
 333   }
 334   if($case >= 0){
 335     global $UTF8_UPPER_ACCENTS;
 336     $string = strtr($string,$UTF8_UPPER_ACCENTS);
 337   }
 338   return $string;
 339 }
 340
 341 /**
 342  * Romanize a non-latin string
 343  *
 344  * @author Andreas Gohr <andi@splitbrain.org>
 345  */
 346 function utf8_romanize($string){
 347   if(utf8_isASCII($string)) return $string; //nothing to do
 348
 349   global $UTF8_ROMANIZATION;
 350   return strtr($string,$UTF8_ROMANIZATION);
 351 }
 352
 353 /**
 354  * Removes special characters (nonalphanumeric) from a UTF-8 string
 355  *
 356  * This function adds the controlchars 0x00 to 0x19 to the array of
 357  * stripped chars (they are not included in $UTF8_SPECIAL_CHARS)
 358  *
 359  * @author Andreas Gohr <andi@splitbrain.org>
 360  * @param  string $string     The UTF8 string to strip of special chars
 361  * @param  string $repl       Replace special with this string
 362  * @param  string $additional Additional chars to strip (used in regexp char class)
 363  */
 364 function utf8_stripspecials($string,$repl='',$additional=''){
 365   global $UTF8_SPECIAL_CHARS;
 366   global $UTF8_SPECIAL_CHARS2;
 367
 368   static $specials = null;
 369   if(is_null($specials)){
 370 #    $specials = preg_quote(unicode_to_utf8($UTF8_SPECIAL_CHARS), '/');
 371     $specials = preg_quote($UTF8_SPECIAL_CHARS2, '/');
 372   }
 373
 374   return preg_replace('/['.$additional.'\x00-\x19'.$specials.']/u',$repl,$string);
 375 }
 376
 377 /**
 378  * This is an Unicode aware replacement for strpos
 379  *
 380  * @author Leo Feyer <leo@typolight.org>
 381  * @see    strpos()
 382  * @param  string
 383  * @param  string
 384  * @param  integer
 385  * @return integer
 386  */
 387 function utf8_strpos($haystack, $needle, $offset=0){
 388     $comp = 0;
 389     $length = null;
 390
 391     while (is_null($length) || $length < $offset) {
 392         $pos = strpos($haystack, $needle, $offset + $comp);
 393
 394         if ($pos === false)
 395             return false;
 396
 397         $length = utf8_strlen(substr($haystack, 0, $pos));
 398
 399         if ($length < $offset)
 400             $comp = $pos - $length;
 401     }
 402
 403     return $length;
 404 }
 405
 406
 407 /**
 408  * This is an Unicode aware replacement for strrpos.
 409  * Based on utf8_strpos written by Leo
 410  *
 411  * @author Harris Wong <harris.wong@utoronto.ca>
 412  * @see    strrpos()
 413  * @param  string
 414  * @param  string
 415  * @param  integer
 416  * @return integer
 417  */
 418 function utf8_strrpos($haystack, $needle, $offset=0){
 419     $comp = 0;
 420     $length = null;
 421
 422     while (is_null($length) || $length < $offset) {
 423         $pos = strrpos($haystack, $needle, $offset + $comp);
 424
 425         if ($pos === false)
 426             return false;
 427
 428         $length = utf8_strlen(substr($haystack, 0, $pos));
 429
 430         if ($length < $offset)
 431             $comp = $pos - $length;
 432     }
 433
 434     return $length;
 435 }
 436
 437
 438 /**
 439  * Encodes UTF-8 characters to HTML entities
 440  *
 441  * @author Tom N Harris <tnharris@whoopdedo.org>
 442  * @author <vpribish at shopping dot com>
 443  * @link   http://www.php.net/manual/en/function.utf8-decode.php
 444  */
 445 function utf8_tohtml ($str) {
 446     $ret = '';
 447     foreach (utf8_to_unicode($str) as $cp) {
 448         if ($cp < 0x80)
 449             $ret .= chr($cp);
 450         elseif ($cp < 0x100)
 451             $ret .= "&#$cp;";
 452         else
 453             $ret .= '&#x'.dechex($cp).';';
 454     }
 455     return $ret;
 456 }
 457
 458 /**
 459  * Decodes HTML entities to UTF-8 characters
 460  *
 461  * Convert any &#..; entity to a codepoint,
 462  * The entities flag defaults to only decoding numeric entities.
 463  * Pass HTML_ENTITIES and named entities, including &amp; &lt; etc.
 464  * are handled as well. Avoids the problem that would occur if you
 465  * had to decode "&amp;#38;&#38;amp;#38;"
 466  *
 467  * unhtmlspecialchars(utf8_unhtml($s)) -> "&#38;&#38;"
 468  * utf8_unhtml(unhtmlspecialchars($s)) -> "&&amp#38;"
 469  * what it should be                   -> "&#38;&amp#38;"
 470  *
 471  * @author Tom N Harris <tnharris@whoopdedo.org>
 472  * @param  string  $str      UTF-8 encoded string
 473  * @param  boolean $entities Flag controlling decoding of named entities.
 474  * @return UTF-8 encoded string with numeric (and named) entities replaced.
 475  */
 476 function utf8_unhtml($str, $entities=null) {
 477     static $decoder = null;
 478     if (is_null($decoder))
 479       $decoder = new utf8_entity_decoder();
 480     if (is_null($entities))
 481         return preg_replace_callback('/(&#([Xx])?([0-9A-Za-z]+);)/m',
 482                                      'utf8_decode_numeric', $str);
 483     else
 484         return preg_replace_callback('/&(#)?([Xx])?([0-9A-Za-z]+);/m',
 485                                      array(&$decoder, 'decode'), $str);
 486 }
 487 function utf8_decode_numeric($ent) {
 488     switch ($ent[2]) {
 489       case 'X':
 490       case 'x':
 491           $cp = hexdec($ent[3]);
 492           break;
 493       default:
 494           $cp = intval($ent[3]);
 495           break;
 496     }
 497     return unicode_to_utf8(array($cp));
 498 }
 499 class utf8_entity_decoder {
 500     var $table;
 501     function utf8_entity_decoder() {
 502         $table = get_html_translation_table(HTML_ENTITIES);
 503         $table = array_flip($table);
 504         $this->table = array_map(array(&$this,'makeutf8'), $table);
 505     }
 506     function makeutf8($c) {
 507         return unicode_to_utf8(array(ord($c)));
 508     }
 509     function decode($ent) {
 510         if ($ent[1] == '#') {
 511             return utf8_decode_numeric($ent);
 512         } elseif (array_key_exists($ent[0],$this->table)) {
 513             return $this->table[$ent[0]];
 514         } else {
 515             return $ent[0];
 516         }
 517     }
 518 }
 519
 520 /**
 521  * Takes an UTF-8 string and returns an array of ints representing the
 522  * Unicode characters. Astral planes are supported ie. the ints in the
 523  * output can be > 0xFFFF. Occurrances of the BOM are ignored. Surrogates
 524  * are not allowed.
 525  *
 526  * If $strict is set to true the function returns false if the input
 527  * string isn't a valid UTF-8 octet sequence and raises a PHP error at
 528  * level E_USER_WARNING
 529  *
 530  * Note: this function has been modified slightly in this library to
 531  * trigger errors on encountering bad bytes
 532  *
 533  * @author <hsivonen@iki.fi>
 534  * @author Harry Fuecks <hfuecks@gmail.com>
 535  * @param  string  UTF-8 encoded string
 536  * @param  boolean Check for invalid sequences?
 537  * @return mixed array of unicode code points or false if UTF-8 invalid
 538  * @see    unicode_to_utf8
 539  * @link   http://hsivonen.iki.fi/php-utf8/
 540  * @link   http://sourceforge.net/projects/phputf8/
 541  */
 542 function utf8_to_unicode($str,$strict=false) {
 543     $mState = 0;     // cached expected number of octets after the current octet
 544                      // until the beginning of the next UTF8 character sequence
 545     $mUcs4  = 0;     // cached Unicode character
 546     $mBytes = 1;     // cached expected number of octets in the current sequence
 547
 548     $out = array();
 549
 550     $len = strlen($str);
 551
 552     for($i = 0; $i < $len; $i++) {
 553
 554         $in = ord($str{$i});
 555
 556         if ( $mState == 0) {
 557
 558             // When mState is zero we expect either a US-ASCII character or a
 559             // multi-octet sequence.
 560             if (0 == (0x80 & ($in))) {
 561                 // US-ASCII, pass straight through.
 562                 $out[] = $in;
 563                 $mBytes = 1;
 564
 565             } else if (0xC0 == (0xE0 & ($in))) {
 566                 // First octet of 2 octet sequence
 567                 $mUcs4 = ($in);
 568                 $mUcs4 = ($mUcs4 & 0x1F) << 6;
 569                 $mState = 1;
 570                 $mBytes = 2;
 571
 572             } else if (0xE0 == (0xF0 & ($in))) {
 573                 // First octet of 3 octet sequence
 574                 $mUcs4 = ($in);
 575                 $mUcs4 = ($mUcs4 & 0x0F) << 12;
 576                 $mState = 2;
 577                 $mBytes = 3;
 578
 579             } else if (0xF0 == (0xF8 & ($in))) {
 580                 // First octet of 4 octet sequence
 581                 $mUcs4 = ($in);
 582                 $mUcs4 = ($mUcs4 & 0x07) << 18;
 583                 $mState = 3;
 584                 $mBytes = 4;
 585
 586             } else if (0xF8 == (0xFC & ($in))) {
 587                 /* First octet of 5 octet sequence.
 588                  *
 589                  * This is illegal because the encoded codepoint must be either
 590                  * (a) not the shortest form or
 591                  * (b) outside the Unicode range of 0-0x10FFFF.
 592                  * Rather than trying to resynchronize, we will carry on until the end
 593                  * of the sequence and let the later error handling code catch it.
 594                  */
 595                 $mUcs4 = ($in);
 596                 $mUcs4 = ($mUcs4 & 0x03) << 24;
 597                 $mState = 4;
 598                 $mBytes = 5;
 599
 600             } else if (0xFC == (0xFE & ($in))) {
 601                 // First octet of 6 octet sequence, see comments for 5 octet sequence.
 602                 $mUcs4 = ($in);
 603                 $mUcs4 = ($mUcs4 & 1) << 30;
 604                 $mState = 5;
 605                 $mBytes = 6;
 606
 607             } elseif($strict) {
 608                 /* Current octet is neither in the US-ASCII range nor a legal first
 609                  * octet of a multi-octet sequence.
 610                  */
 611                 trigger_error(
 612                         'utf8_to_unicode: Illegal sequence identifier '.
 613                             'in UTF-8 at byte '.$i,
 614                         E_USER_WARNING
 615                     );
 616                 return false;
 617
 618             }
 619
 620         } else {
 621
 622             // When mState is non-zero, we expect a continuation of the multi-octet
 623             // sequence
 624             if (0x80 == (0xC0 & ($in))) {
 625
 626                 // Legal continuation.
 627                 $shift = ($mState - 1) * 6;
 628                 $tmp = $in;
 629                 $tmp = ($tmp & 0x0000003F) << $shift;
 630                 $mUcs4 |= $tmp;
 631
 632                 /**
 633                  * End of the multi-octet sequence. mUcs4 now contains the final
 634                  * Unicode codepoint to be output
 635                  */
 636                 if (0 == --$mState) {
 637
 638                     /*
 639                      * Check for illegal sequences and codepoints.
 640                      */
 641                     // From Unicode 3.1, non-shortest form is illegal
 642                     if (((2 == $mBytes) && ($mUcs4 < 0x0080)) ||
 643                         ((3 == $mBytes) && ($mUcs4 < 0x0800)) ||
 644                         ((4 == $mBytes) && ($mUcs4 < 0x10000)) ||
 645                         (4 < $mBytes) ||
 646                         // From Unicode 3.2, surrogate characters are illegal
 647                         (($mUcs4 & 0xFFFFF800) == 0xD800) ||
 648                         // Codepoints outside the Unicode range are illegal
 649                         ($mUcs4 > 0x10FFFF)) {
 650
 651                         if($strict){
 652                             trigger_error(
 653                                     'utf8_to_unicode: Illegal sequence or codepoint '.
 654                                         'in UTF-8 at byte '.$i,
 655                                     E_USER_WARNING
 656                                 );
 657
 658                             return false;
 659                         }
 660
 661                     }
 662
 663                     if (0xFEFF != $mUcs4) {
 664                         // BOM is legal but we don't want to output it
 665                         $out[] = $mUcs4;
 666                     }
 667
 668                     //initialize UTF8 cache
 669                     $mState = 0;
 670                     $mUcs4  = 0;
 671                     $mBytes = 1;
 672                 }
 673
 674             } elseif($strict) {
 675                 /**
 676                  *((0xC0 & (*in) != 0x80) && (mState != 0))
 677                  * Incomplete multi-octet sequence.
 678                  */
 679                 trigger_error(
 680                         'utf8_to_unicode: Incomplete multi-octet '.
 681                         '   sequence in UTF-8 at byte '.$i,
 682                         E_USER_WARNING
 683                     );
 684
 685                 return false;
 686             }
 687         }
 688     }
 689     return $out;
 690 }
 691
 692 /**
 693  * Takes an array of ints representing the Unicode characters and returns
 694  * a UTF-8 string. Astral planes are supported ie. the ints in the
 695  * input can be > 0xFFFF. Occurrances of the BOM are ignored. Surrogates
 696  * are not allowed.
 697  *
 698  * If $strict is set to true the function returns false if the input
 699  * array contains ints that represent surrogates or are outside the
 700  * Unicode range and raises a PHP error at level E_USER_WARNING
 701  *
 702  * Note: this function has been modified slightly in this library to use
 703  * output buffering to concatenate the UTF-8 string (faster) as well as
 704  * reference the array by it's keys
 705  *
 706  * @param  array of unicode code points representing a string
 707  * @param  boolean Check for invalid sequences?
 708  * @return mixed UTF-8 string or false if array contains invalid code points
 709  * @author <hsivonen@iki.fi>
 710  * @author Harry Fuecks <hfuecks@gmail.com>
 711  * @see    utf8_to_unicode
 712  * @link   http://hsivonen.iki.fi/php-utf8/
 713  * @link   http://sourceforge.net/projects/phputf8/
 714  */
 715 function unicode_to_utf8($arr,$strict=false) {
 716     if (!is_array($arr)) return '';
 717     ob_start();
 718
 719     foreach (array_keys($arr) as $k) {
 720
 721         # ASCII range (including control chars)
 722         if ( ($arr[$k] >= 0) && ($arr[$k] <= 0x007f) ) {
 723
 724             echo chr($arr[$k]);
 725
 726         # 2 byte sequence
 727         } else if ($arr[$k] <= 0x07ff) {
 728
 729             echo chr(0xc0 | ($arr[$k] >> 6));
 730             echo chr(0x80 | ($arr[$k] & 0x003f));
 731
 732         # Byte order mark (skip)
 733         } else if($arr[$k] == 0xFEFF) {
 734
 735             // nop -- zap the BOM
 736
 737         # Test for illegal surrogates
 738         } else if ($arr[$k] >= 0xD800 && $arr[$k] <= 0xDFFF) {
 739
 740             // found a surrogate
 741             if($strict){
 742                 trigger_error(
 743                     'unicode_to_utf8: Illegal surrogate '.
 744                         'at index: '.$k.', value: '.$arr[$k],
 745                     E_USER_WARNING
 746                     );
 747                 return false;
 748             }
 749
 750         # 3 byte sequence
 751         } else if ($arr[$k] <= 0xffff) {
 752
 753             echo chr(0xe0 | ($arr[$k] >> 12));
 754             echo chr(0x80 | (($arr[$k] >> 6) & 0x003f));
 755             echo chr(0x80 | ($arr[$k] & 0x003f));
 756
 757         # 4 byte sequence
 758         } else if ($arr[$k] <= 0x10ffff) {
 759
 760             echo chr(0xf0 | ($arr[$k] >> 18));
 761             echo chr(0x80 | (($arr[$k] >> 12) & 0x3f));
 762             echo chr(0x80 | (($arr[$k] >> 6) & 0x3f));
 763             echo chr(0x80 | ($arr[$k] & 0x3f));
 764
 765         } elseif($strict) {
 766
 767             trigger_error(
 768                 'unicode_to_utf8: Codepoint out of Unicode range '.
 769                     'at index: '.$k.', value: '.$arr[$k],
 770                 E_USER_WARNING
 771                 );
 772
 773             // out of range
 774             return false;
 775         }
 776     }
 777
 778     $result = ob_get_contents();
 779     ob_end_clean();
 780     return $result;
 781 }
 782
 783 /**
 784  * UTF-8 to UTF-16BE conversion.
 785  *
 786  * Maybe really UCS-2 without mb_string due to utf8_to_unicode limits
 787  */
 788 function utf8_to_utf16be(&$str, $bom = false) {
 789   $out = $bom ? "\xFE\xFF" : '';
 790   if(UTF8_MBSTRING) return $out.mb_convert_encoding($str,'UTF-16BE','UTF-8');
 791
 792   $uni = utf8_to_unicode($str);
 793   foreach($uni as $cp){
 794     $out .= pack('n',$cp);
 795   }
 796   return $out;
 797 }
 798
 799 /**
 800  * UTF-8 to UTF-16BE conversion.
 801  *
 802  * Maybe really UCS-2 without mb_string due to utf8_to_unicode limits
 803  */
 804 function utf16be_to_utf8(&$str) {
 805   $uni = unpack('n*',$str);
 806   return unicode_to_utf8($uni);
 807 }
 808
 809 /**
 810  * Replace bad bytes with an alternative character
 811  *
 812  * ASCII character is recommended for replacement char
 813  *
 814  * PCRE Pattern to locate bad bytes in a UTF-8 string
 815  * Comes from W3 FAQ: Multilingual Forms
 816  * Note: modified to include full ASCII range including control chars
 817  *
 818  * @author Harry Fuecks <hfuecks@gmail.com>
 819  * @see http://www.w3.org/International/questions/qa-forms-utf-8
 820  * @param string to search
 821  * @param string to replace bad bytes with (defaults to '?') - use ASCII
 822  * @return string
 823  */
 824 function utf8_bad_replace($str, $replace = '') {
 825     $UTF8_BAD =
 826      '([\x00-\x7F]'.                          # ASCII (including control chars)
 827      '|[\xC2-\xDF][\x80-\xBF]'.               # non-overlong 2-byte
 828      '|\xE0[\xA0-\xBF][\x80-\xBF]'.           # excluding overlongs
 829      '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'.    # straight 3-byte
 830      '|\xED[\x80-\x9F][\x80-\xBF]'.           # excluding surrogates
 831      '|\xF0[\x90-\xBF][\x80-\xBF]{2}'.        # planes 1-3
 832      '|[\xF1-\xF3][\x80-\xBF]{3}'.            # planes 4-15
 833      '|\xF4[\x80-\x8F][\x80-\xBF]{2}'.        # plane 16
 834      '|(.{1}))';                              # invalid byte
 835     ob_start();
 836     while (preg_match('/'.$UTF8_BAD.'/S', $str, $matches)) {
 837         if ( !isset($matches[2])) {
 838             echo $matches[0];
 839         } else {
 840             echo $replace;
 841         }
 842         $str = substr($str,strlen($matches[0]));
 843     }
 844     $result = ob_get_contents();
 845     ob_end_clean();
 846     return $result;
 847 }
 848
 849 /**
 850  * adjust a byte index into a utf8 string to a utf8 character boundary
 851  *
 852  * @param $str   string   utf8 character string
 853  * @param $i     int      byte index into $str
 854  * @param $next  bool     direction to search for boundary,
 855  *                           false = up (current character)
 856  *                           true = down (next character)
 857  *
 858  * @return int            byte index into $str now pointing to a utf8 character boundary
 859  *
 860  * @author       chris smith <chris@jalakai.co.uk>
 861  */
 862 function utf8_correctIdx(&$str,$i,$next=false) {
 863
 864   if ($i <= 0) return 0;
 865
 866   $limit = strlen($str);
 867   if ($i>=$limit) return $limit;
 868
 869   if ($next) {
 870     while (($i<$limit) && ((ord($str[$i]) & 0xC0) == 0x80)) $i++;
 871   } else {
 872     while ($i && ((ord($str[$i]) & 0xC0) == 0x80)) $i--;
 873   }
 874
 875   return $i;
 876 }
 877
 878 // only needed if no mb_string available
 879 if(!UTF8_MBSTRING){
 880   /**
 881    * UTF-8 Case lookup table
 882    *
 883    * Maps each lower case letter (array key) to its corresponding upper case
 884    * letter (array value); keys and values are UTF-8 encoded strings
 885    *
 886    * @author Andreas Gohr <andi@splitbrain.org>
 887    */
 888   global $UTF8_LOWER_TO_UPPER;
 889   $UTF8_LOWER_TO_UPPER = array(
 890     "ｚ"=>"Ｚ","ｙ"=>"Ｙ","ｘ"=>"Ｘ","ｗ"=>"Ｗ","ｖ"=>"Ｖ","ｕ"=>"Ｕ","ｔ"=>"Ｔ","ｓ"=>"Ｓ","ｒ"=>"Ｒ","ｑ"=>"Ｑ",
 891     "ｐ"=>"Ｐ","ｏ"=>"Ｏ","ｎ"=>"Ｎ","ｍ"=>"Ｍ","ｌ"=>"Ｌ","ｋ"=>"Ｋ","ｊ"=>"Ｊ","ｉ"=>"Ｉ","ｈ"=>"Ｈ","ｇ"=>"Ｇ",
 892     "ｆ"=>"Ｆ","ｅ"=>"Ｅ","ｄ"=>"Ｄ","ｃ"=>"Ｃ","ｂ"=>"Ｂ","ａ"=>"Ａ","ῳ"=>"ῼ","ῥ"=>"Ῥ","ῡ"=>"Ῡ","ῑ"=>"Ῑ",
 893     "ῐ"=>"Ῐ","ῃ"=>"ῌ","ι"=>"Ι","ᾳ"=>"ᾼ","ᾱ"=>"Ᾱ","ᾰ"=>"Ᾰ","ᾧ"=>"ᾯ","ᾦ"=>"ᾮ","ᾥ"=>"ᾭ","ᾤ"=>"ᾬ",
 894     "ᾣ"=>"ᾫ","ᾢ"=>"ᾪ","ᾡ"=>"ᾩ","ᾗ"=>"ᾟ","ᾖ"=>"ᾞ","ᾕ"=>"ᾝ","ᾔ"=>"ᾜ","ᾓ"=>"ᾛ","ᾒ"=>"ᾚ","ᾑ"=>"ᾙ",
 895     "ᾐ"=>"ᾘ","ᾇ"=>"ᾏ","ᾆ"=>"ᾎ","ᾅ"=>"ᾍ","ᾄ"=>"ᾌ","ᾃ"=>"ᾋ","ᾂ"=>"ᾊ","ᾁ"=>"ᾉ","ᾀ"=>"ᾈ","ώ"=>"Ώ",
 896     "ὼ"=>"Ὼ","ύ"=>"Ύ","ὺ"=>"Ὺ","ό"=>"Ό","ὸ"=>"Ὸ","ί"=>"Ί","ὶ"=>"Ὶ","ή"=>"Ή","ὴ"=>"Ὴ","έ"=>"Έ",
 897     "ὲ"=>"Ὲ","ά"=>"Ά","ὰ"=>"Ὰ","ὧ"=>"Ὧ","ὦ"=>"Ὦ","ὥ"=>"Ὥ","ὤ"=>"Ὤ","ὣ"=>"Ὣ","ὢ"=>"Ὢ","ὡ"=>"Ὡ",
 898     "ὗ"=>"Ὗ","ὕ"=>"Ὕ","ὓ"=>"Ὓ","ὑ"=>"Ὑ","ὅ"=>"Ὅ","ὄ"=>"Ὄ","ὃ"=>"Ὃ","ὂ"=>"Ὂ","ὁ"=>"Ὁ","ὀ"=>"Ὀ",
 899     "ἷ"=>"Ἷ","ἶ"=>"Ἶ","ἵ"=>"Ἵ","ἴ"=>"Ἴ","ἳ"=>"Ἳ","ἲ"=>"Ἲ","ἱ"=>"Ἱ","ἰ"=>"Ἰ","ἧ"=>"Ἧ","ἦ"=>"Ἦ",
 900     "ἥ"=>"Ἥ","ἤ"=>"Ἤ","ἣ"=>"Ἣ","ἢ"=>"Ἢ","ἡ"=>"Ἡ","ἕ"=>"Ἕ","ἔ"=>"Ἔ","ἓ"=>"Ἓ","ἒ"=>"Ἒ","ἑ"=>"Ἑ",
 901     "ἐ"=>"Ἐ","ἇ"=>"Ἇ","ἆ"=>"Ἆ","ἅ"=>"Ἅ","ἄ"=>"Ἄ","ἃ"=>"Ἃ","ἂ"=>"Ἂ","ἁ"=>"Ἁ","ἀ"=>"Ἀ","ỹ"=>"Ỹ",
 902     "ỷ"=>"Ỷ","ỵ"=>"Ỵ","ỳ"=>"Ỳ","ự"=>"Ự","ữ"=>"Ữ","ử"=>"Ử","ừ"=>"Ừ","ứ"=>"Ứ","ủ"=>"Ủ","ụ"=>"Ụ",
 903     "ợ"=>"Ợ","ỡ"=>"Ỡ","ở"=>"Ở","ờ"=>"Ờ","ớ"=>"Ớ","ộ"=>"Ộ","ỗ"=>"Ỗ","ổ"=>"Ổ","ồ"=>"Ồ","ố"=>"Ố",
 904     "ỏ"=>"Ỏ","ọ"=>"Ọ","ị"=>"Ị","ỉ"=>"Ỉ","ệ"=>"Ệ","ễ"=>"Ễ","ể"=>"Ể","ề"=>"Ề","ế"=>"Ế","ẽ"=>"Ẽ",
 905     "ẻ"=>"Ẻ","ẹ"=>"Ẹ","ặ"=>"Ặ","ẵ"=>"Ẵ","ẳ"=>"Ẳ","ằ"=>"Ằ","ắ"=>"Ắ","ậ"=>"Ậ","ẫ"=>"Ẫ","ẩ"=>"Ẩ",
 906     "ầ"=>"Ầ","ấ"=>"Ấ","ả"=>"Ả","ạ"=>"Ạ","ẛ"=>"Ṡ","ẕ"=>"Ẕ","ẓ"=>"Ẓ","ẑ"=>"Ẑ","ẏ"=>"Ẏ","ẍ"=>"Ẍ",
 907     "ẋ"=>"Ẋ","ẉ"=>"Ẉ","ẇ"=>"Ẇ","ẅ"=>"Ẅ","ẃ"=>"Ẃ","ẁ"=>"Ẁ","ṿ"=>"Ṿ","ṽ"=>"Ṽ","ṻ"=>"Ṻ","ṹ"=>"Ṹ",
 908     "ṷ"=>"Ṷ","ṵ"=>"Ṵ","ṳ"=>"Ṳ","ṱ"=>"Ṱ","ṯ"=>"Ṯ","ṭ"=>"Ṭ","ṫ"=>"Ṫ","ṩ"=>"Ṩ","ṧ"=>"Ṧ","ṥ"=>"Ṥ",
 909     "ṣ"=>"Ṣ","ṡ"=>"Ṡ","ṟ"=>"Ṟ","ṝ"=>"Ṝ","ṛ"=>"Ṛ","ṙ"=>"Ṙ","ṗ"=>"Ṗ","ṕ"=>"Ṕ","ṓ"=>"Ṓ","ṑ"=>"Ṑ",
 910     "ṏ"=>"Ṏ","ṍ"=>"Ṍ","ṋ"=>"Ṋ","ṉ"=>"Ṉ","ṇ"=>"Ṇ","ṅ"=>"Ṅ","ṃ"=>"Ṃ","ṁ"=>"Ṁ","ḿ"=>"Ḿ","ḽ"=>"Ḽ",
 911     "ḻ"=>"Ḻ","ḹ"=>"Ḹ","ḷ"=>"Ḷ","ḵ"=>"Ḵ","ḳ"=>"Ḳ","ḱ"=>"Ḱ","ḯ"=>"Ḯ","ḭ"=>"Ḭ","ḫ"=>"Ḫ","ḩ"=>"Ḩ",
 912     "ḧ"=>"Ḧ","ḥ"=>"Ḥ","ḣ"=>"Ḣ","ḡ"=>"Ḡ","ḟ"=>"Ḟ","ḝ"=>"Ḝ","ḛ"=>"Ḛ","ḙ"=>"Ḙ","ḗ"=>"Ḗ","ḕ"=>"Ḕ",
 913     "ḓ"=>"Ḓ","ḑ"=>"Ḑ","ḏ"=>"Ḏ","ḍ"=>"Ḍ","ḋ"=>"Ḋ","ḉ"=>"Ḉ","ḇ"=>"Ḇ","ḅ"=>"Ḅ","ḃ"=>"Ḃ","ḁ"=>"Ḁ",
 914     "ֆ"=>"Ֆ","օ"=>"Օ","ք"=>"Ք","փ"=>"Փ","ւ"=>"Ւ","ց"=>"Ց","ր"=>"Ր","տ"=>"Տ","վ"=>"Վ","ս"=>"Ս",
 915     "ռ"=>"Ռ","ջ"=>"Ջ","պ"=>"Պ","չ"=>"Չ","ո"=>"Ո","շ"=>"Շ","ն"=>"Ն","յ"=>"Յ","մ"=>"Մ","ճ"=>"Ճ",
 916     "ղ"=>"Ղ","ձ"=>"Ձ","հ"=>"Հ","կ"=>"Կ","ծ"=>"Ծ","խ"=>"Խ","լ"=>"Լ","ի"=>"Ի","ժ"=>"Ժ","թ"=>"Թ",
 917     "ը"=>"Ը","է"=>"Է","զ"=>"Զ","ե"=>"Ե","դ"=>"Դ","գ"=>"Գ","բ"=>"Բ","ա"=>"Ա","ԏ"=>"Ԏ","ԍ"=>"Ԍ",
 918     "ԋ"=>"Ԋ","ԉ"=>"Ԉ","ԇ"=>"Ԇ","ԅ"=>"Ԅ","ԃ"=>"Ԃ","ԁ"=>"Ԁ","ӹ"=>"Ӹ","ӵ"=>"Ӵ","ӳ"=>"Ӳ","ӱ"=>"Ӱ",
 919     "ӯ"=>"Ӯ","ӭ"=>"Ӭ","ӫ"=>"Ӫ","ө"=>"Ө","ӧ"=>"Ӧ","ӥ"=>"Ӥ","ӣ"=>"Ӣ","ӡ"=>"Ӡ","ӟ"=>"Ӟ","ӝ"=>"Ӝ",
 920     "ӛ"=>"Ӛ","ә"=>"Ә","ӗ"=>"Ӗ","ӕ"=>"Ӕ","ӓ"=>"Ӓ","ӑ"=>"Ӑ","ӎ"=>"Ӎ","ӌ"=>"Ӌ","ӊ"=>"Ӊ","ӈ"=>"Ӈ",
 921     "ӆ"=>"Ӆ","ӄ"=>"Ӄ","ӂ"=>"Ӂ","ҿ"=>"Ҿ","ҽ"=>"Ҽ","һ"=>"Һ","ҹ"=>"Ҹ","ҷ"=>"Ҷ","ҵ"=>"Ҵ","ҳ"=>"Ҳ",
 922     "ұ"=>"Ұ","ү"=>"Ү","ҭ"=>"Ҭ","ҫ"=>"Ҫ","ҩ"=>"Ҩ","ҧ"=>"Ҧ","ҥ"=>"Ҥ","ң"=>"Ң","ҡ"=>"Ҡ","ҟ"=>"Ҟ",
 923     "ҝ"=>"Ҝ","қ"=>"Қ","ҙ"=>"Ҙ","җ"=>"Җ","ҕ"=>"Ҕ","ғ"=>"Ғ","ґ"=>"Ґ","ҏ"=>"Ҏ","ҍ"=>"Ҍ","ҋ"=>"Ҋ",
 924     "ҁ"=>"Ҁ","ѿ"=>"Ѿ","ѽ"=>"Ѽ","ѻ"=>"Ѻ","ѹ"=>"Ѹ","ѷ"=>"Ѷ","ѵ"=>"Ѵ","ѳ"=>"Ѳ","ѱ"=>"Ѱ","ѯ"=>"Ѯ",
 925     "ѭ"=>"Ѭ","ѫ"=>"Ѫ","ѩ"=>"Ѩ","ѧ"=>"Ѧ","ѥ"=>"Ѥ","ѣ"=>"Ѣ","ѡ"=>"Ѡ","џ"=>"Џ","ў"=>"Ў","ѝ"=>"Ѝ",
 926     "ќ"=>"Ќ","ћ"=>"Ћ","њ"=>"Њ","љ"=>"Љ","ј"=>"Ј","ї"=>"Ї","і"=>"І","ѕ"=>"Ѕ","є"=>"Є","ѓ"=>"Ѓ",
 927     "ђ"=>"Ђ","ё"=>"Ё","ѐ"=>"Ѐ","я"=>"Я","ю"=>"Ю","э"=>"Э","ь"=>"Ь","ы"=>"Ы","ъ"=>"Ъ","щ"=>"Щ",
 928     "ш"=>"Ш","ч"=>"Ч","ц"=>"Ц","х"=>"Х","ф"=>"Ф","у"=>"У","т"=>"Т","с"=>"С","р"=>"Р","п"=>"П",
 929     "о"=>"О","н"=>"Н","м"=>"М","л"=>"Л","к"=>"К","й"=>"Й","и"=>"И","з"=>"З","ж"=>"Ж","е"=>"Е",
 930     "д"=>"Д","г"=>"Г","в"=>"В","б"=>"Б","а"=>"А","ϵ"=>"Ε","ϲ"=>"Σ","ϱ"=>"Ρ","ϰ"=>"Κ","ϯ"=>"Ϯ",
 931     "ϭ"=>"Ϭ","ϫ"=>"Ϫ","ϩ"=>"Ϩ","ϧ"=>"Ϧ","ϥ"=>"Ϥ","ϣ"=>"Ϣ","ϡ"=>"Ϡ","ϟ"=>"Ϟ","ϝ"=>"Ϝ","ϛ"=>"Ϛ",
 932     "ϙ"=>"Ϙ","ϖ"=>"Π","ϕ"=>"Φ","ϑ"=>"Θ","ϐ"=>"Β","ώ"=>"Ώ","ύ"=>"Ύ","ό"=>"Ό","ϋ"=>"Ϋ","ϊ"=>"Ϊ",
 933     "ω"=>"Ω","ψ"=>"Ψ","χ"=>"Χ","φ"=>"Φ","υ"=>"Υ","τ"=>"Τ","σ"=>"Σ","ς"=>"Σ","ρ"=>"Ρ","π"=>"Π",
 934     "ο"=>"Ο","ξ"=>"Ξ","ν"=>"Ν","μ"=>"Μ","λ"=>"Λ","κ"=>"Κ","ι"=>"Ι","θ"=>"Θ","η"=>"Η","ζ"=>"Ζ",
 935     "ε"=>"Ε","δ"=>"Δ","γ"=>"Γ","β"=>"Β","α"=>"Α","ί"=>"Ί","ή"=>"Ή","έ"=>"Έ","ά"=>"Ά","ʒ"=>"Ʒ",
 936     "ʋ"=>"Ʋ","ʊ"=>"Ʊ","ʈ"=>"Ʈ","ʃ"=>"Ʃ","ʀ"=>"Ʀ","ɵ"=>"Ɵ","ɲ"=>"Ɲ","ɯ"=>"Ɯ","ɩ"=>"Ɩ","ɨ"=>"Ɨ",
 937     "ɣ"=>"Ɣ","ɛ"=>"Ɛ","ə"=>"Ə","ɗ"=>"Ɗ","ɖ"=>"Ɖ","ɔ"=>"Ɔ","ɓ"=>"Ɓ","ȳ"=>"Ȳ","ȱ"=>"Ȱ","ȯ"=>"Ȯ",
 938     "ȭ"=>"Ȭ","ȫ"=>"Ȫ","ȩ"=>"Ȩ","ȧ"=>"Ȧ","ȥ"=>"Ȥ","ȣ"=>"Ȣ","ȟ"=>"Ȟ","ȝ"=>"Ȝ","ț"=>"Ț","ș"=>"Ș",
 939     "ȗ"=>"Ȗ","ȕ"=>"Ȕ","ȓ"=>"Ȓ","ȑ"=>"Ȑ","ȏ"=>"Ȏ","ȍ"=>"Ȍ","ȋ"=>"Ȋ","ȉ"=>"Ȉ","ȇ"=>"Ȇ","ȅ"=>"Ȅ",
 940     "ȃ"=>"Ȃ","ȁ"=>"Ȁ","ǿ"=>"Ǿ","ǽ"=>"Ǽ","ǻ"=>"Ǻ","ǹ"=>"Ǹ","ǵ"=>"Ǵ","ǳ"=>"ǲ","ǯ"=>"Ǯ","ǭ"=>"Ǭ",
 941     "ǫ"=>"Ǫ","ǩ"=>"Ǩ","ǧ"=>"Ǧ","ǥ"=>"Ǥ","ǣ"=>"Ǣ","ǡ"=>"Ǡ","ǟ"=>"Ǟ","ǝ"=>"Ǝ","ǜ"=>"Ǜ","ǚ"=>"Ǚ",
 942     "ǘ"=>"Ǘ","ǖ"=>"Ǖ","ǔ"=>"Ǔ","ǒ"=>"Ǒ","ǐ"=>"Ǐ","ǎ"=>"Ǎ","ǌ"=>"ǋ","ǉ"=>"ǈ","ǆ"=>"ǅ","ƿ"=>"Ƿ",
 943     "ƽ"=>"Ƽ","ƹ"=>"Ƹ","ƶ"=>"Ƶ","ƴ"=>"Ƴ","ư"=>"Ư","ƭ"=>"Ƭ","ƨ"=>"Ƨ","ƥ"=>"Ƥ","ƣ"=>"Ƣ","ơ"=>"Ơ",
 944     "ƞ"=>"Ƞ","ƙ"=>"Ƙ","ƕ"=>"Ƕ","ƒ"=>"Ƒ","ƌ"=>"Ƌ","ƈ"=>"Ƈ","ƅ"=>"Ƅ","ƃ"=>"Ƃ","ſ"=>"S","ž"=>"Ž",
 945     "ż"=>"Ż","ź"=>"Ź","ŷ"=>"Ŷ","ŵ"=>"Ŵ","ų"=>"Ų","ű"=>"Ű","ů"=>"Ů","ŭ"=>"Ŭ","ū"=>"Ū","ũ"=>"Ũ",
 946     "ŧ"=>"Ŧ","ť"=>"Ť","ţ"=>"Ţ","š"=>"Š","ş"=>"Ş","ŝ"=>"Ŝ","ś"=>"Ś","ř"=>"Ř","ŗ"=>"Ŗ","ŕ"=>"Ŕ",
 947     "œ"=>"Œ","ő"=>"Ő","ŏ"=>"Ŏ","ō"=>"Ō","ŋ"=>"Ŋ","ň"=>"Ň","ņ"=>"Ņ","ń"=>"Ń","ł"=>"Ł","ŀ"=>"Ŀ",
 948     "ľ"=>"Ľ","ļ"=>"Ļ","ĺ"=>"Ĺ","ķ"=>"Ķ","ĵ"=>"Ĵ","ĳ"=>"Ĳ","ı"=>"I","į"=>"Į","ĭ"=>"Ĭ","ī"=>"Ī",
 949     "ĩ"=>"Ĩ","ħ"=>"Ħ","ĥ"=>"Ĥ","ģ"=>"Ģ","ġ"=>"Ġ","ğ"=>"Ğ","ĝ"=>"Ĝ","ě"=>"Ě","ę"=>"Ę","ė"=>"Ė",
 950     "ĕ"=>"Ĕ","ē"=>"Ē","đ"=>"Đ","ď"=>"Ď","č"=>"Č","ċ"=>"Ċ","ĉ"=>"Ĉ","ć"=>"Ć","ą"=>"Ą","ă"=>"Ă",
 951     "ā"=>"Ā","ÿ"=>"Ÿ","þ"=>"Þ","ý"=>"Ý","ü"=>"Ü","û"=>"Û","ú"=>"Ú","ù"=>"Ù","ø"=>"Ø","ö"=>"Ö",
 952     "õ"=>"Õ","ô"=>"Ô","ó"=>"Ó","ò"=>"Ò","ñ"=>"Ñ","ð"=>"Ð","ï"=>"Ï","î"=>"Î","í"=>"Í","ì"=>"Ì",
 953     "ë"=>"Ë","ê"=>"Ê","é"=>"É","è"=>"È","ç"=>"Ç","æ"=>"Æ","å"=>"Å","ä"=>"Ä","ã"=>"Ã","â"=>"Â",
 954     "á"=>"Á","à"=>"À","µ"=>"Μ","z"=>"Z","y"=>"Y","x"=>"X","w"=>"W","v"=>"V","u"=>"U","t"=>"T",
 955     "s"=>"S","r"=>"R","q"=>"Q","p"=>"P","o"=>"O","n"=>"N","m"=>"M","l"=>"L","k"=>"K","j"=>"J",
 956     "i"=>"I","h"=>"H","g"=>"G","f"=>"F","e"=>"E","d"=>"D","c"=>"C","b"=>"B","a"=>"A"
 957   );
 958
 959   /**
 960    * UTF-8 Case lookup table
 961    *
 962    * Maps each upper case letter (array key) to its corresponding lower case letter (array value); both are UTF-8 strings.
 963    * A key written twice keeps only its last value, so for letters with several lower case forms the plain form must come last.
 964    *
 965    * @author Andreas Gohr <andi@splitbrain.org>
 966    */
 967   global $UTF8_UPPER_TO_LOWER;
 968   $UTF8_UPPER_TO_LOWER = array (
 969     "Ｚ"=>"ｚ","Ｙ"=>"ｙ","Ｘ"=>"ｘ","Ｗ"=>"ｗ","Ｖ"=>"ｖ","Ｕ"=>"ｕ","Ｔ"=>"ｔ","Ｓ"=>"ｓ","Ｒ"=>"ｒ","Ｑ"=>"ｑ",
 970     "Ｐ"=>"ｐ","Ｏ"=>"ｏ","Ｎ"=>"ｎ","Ｍ"=>"ｍ","Ｌ"=>"ｌ","Ｋ"=>"ｋ","Ｊ"=>"ｊ","Ｉ"=>"ｉ","Ｈ"=>"ｈ","Ｇ"=>"ｇ",
 971     "Ｆ"=>"ｆ","Ｅ"=>"ｅ","Ｄ"=>"ｄ","Ｃ"=>"ｃ","Ｂ"=>"ｂ","Ａ"=>"ａ","ῼ"=>"ῳ","Ῥ"=>"ῥ","Ῡ"=>"ῡ","Ῑ"=>"ῑ",
 972     "Ῐ"=>"ῐ","ῌ"=>"ῃ","Ι"=>"ι","ᾼ"=>"ᾳ","Ᾱ"=>"ᾱ","Ᾰ"=>"ᾰ","ᾯ"=>"ᾧ","ᾮ"=>"ᾦ","ᾭ"=>"ᾥ","ᾬ"=>"ᾤ",
 973     "ᾫ"=>"ᾣ","ᾪ"=>"ᾢ","ᾩ"=>"ᾡ","ᾟ"=>"ᾗ","ᾞ"=>"ᾖ","ᾝ"=>"ᾕ","ᾜ"=>"ᾔ","ᾛ"=>"ᾓ","ᾚ"=>"ᾒ","ᾙ"=>"ᾑ",
 974     "ᾘ"=>"ᾐ","ᾏ"=>"ᾇ","ᾎ"=>"ᾆ","ᾍ"=>"ᾅ","ᾌ"=>"ᾄ","ᾋ"=>"ᾃ","ᾊ"=>"ᾂ","ᾉ"=>"ᾁ","ᾈ"=>"ᾀ","Ώ"=>"ώ",
 975     "Ὼ"=>"ὼ","Ύ"=>"ύ","Ὺ"=>"ὺ","Ό"=>"ό","Ὸ"=>"ὸ","Ί"=>"ί","Ὶ"=>"ὶ","Ή"=>"ή","Ὴ"=>"ὴ","Έ"=>"έ",
 976     "Ὲ"=>"ὲ","Ά"=>"ά","Ὰ"=>"ὰ","Ὧ"=>"ὧ","Ὦ"=>"ὦ","Ὥ"=>"ὥ","Ὤ"=>"ὤ","Ὣ"=>"ὣ","Ὢ"=>"ὢ","Ὡ"=>"ὡ",
 977     "Ὗ"=>"ὗ","Ὕ"=>"ὕ","Ὓ"=>"ὓ","Ὑ"=>"ὑ","Ὅ"=>"ὅ","Ὄ"=>"ὄ","Ὃ"=>"ὃ","Ὂ"=>"ὂ","Ὁ"=>"ὁ","Ὀ"=>"ὀ",
 978     "Ἷ"=>"ἷ","Ἶ"=>"ἶ","Ἵ"=>"ἵ","Ἴ"=>"ἴ","Ἳ"=>"ἳ","Ἲ"=>"ἲ","Ἱ"=>"ἱ","Ἰ"=>"ἰ","Ἧ"=>"ἧ","Ἦ"=>"ἦ",
 979     "Ἥ"=>"ἥ","Ἤ"=>"ἤ","Ἣ"=>"ἣ","Ἢ"=>"ἢ","Ἡ"=>"ἡ","Ἕ"=>"ἕ","Ἔ"=>"ἔ","Ἓ"=>"ἓ","Ἒ"=>"ἒ","Ἑ"=>"ἑ",
 980     "Ἐ"=>"ἐ","Ἇ"=>"ἇ","Ἆ"=>"ἆ","Ἅ"=>"ἅ","Ἄ"=>"ἄ","Ἃ"=>"ἃ","Ἂ"=>"ἂ","Ἁ"=>"ἁ","Ἀ"=>"ἀ","Ỹ"=>"ỹ",
 981     "Ỷ"=>"ỷ","Ỵ"=>"ỵ","Ỳ"=>"ỳ","Ự"=>"ự","Ữ"=>"ữ","Ử"=>"ử","Ừ"=>"ừ","Ứ"=>"ứ","Ủ"=>"ủ","Ụ"=>"ụ",
 982     "Ợ"=>"ợ","Ỡ"=>"ỡ","Ở"=>"ở","Ờ"=>"ờ","Ớ"=>"ớ","Ộ"=>"ộ","Ỗ"=>"ỗ","Ổ"=>"ổ","Ồ"=>"ồ","Ố"=>"ố",
 983     "Ỏ"=>"ỏ","Ọ"=>"ọ","Ị"=>"ị","Ỉ"=>"ỉ","Ệ"=>"ệ","Ễ"=>"ễ","Ể"=>"ể","Ề"=>"ề","Ế"=>"ế","Ẽ"=>"ẽ",
 984     "Ẻ"=>"ẻ","Ẹ"=>"ẹ","Ặ"=>"ặ","Ẵ"=>"ẵ","Ẳ"=>"ẳ","Ằ"=>"ằ","Ắ"=>"ắ","Ậ"=>"ậ","Ẫ"=>"ẫ","Ẩ"=>"ẩ",
 985     "Ầ"=>"ầ","Ấ"=>"ấ","Ả"=>"ả","Ạ"=>"ạ","Ṡ"=>"ẛ","Ẕ"=>"ẕ","Ẓ"=>"ẓ","Ẑ"=>"ẑ","Ẏ"=>"ẏ","Ẍ"=>"ẍ",
 986     "Ẋ"=>"ẋ","Ẉ"=>"ẉ","Ẇ"=>"ẇ","Ẅ"=>"ẅ","Ẃ"=>"ẃ","Ẁ"=>"ẁ","Ṿ"=>"ṿ","Ṽ"=>"ṽ","Ṻ"=>"ṻ","Ṹ"=>"ṹ",
 987     "Ṷ"=>"ṷ","Ṵ"=>"ṵ","Ṳ"=>"ṳ","Ṱ"=>"ṱ","Ṯ"=>"ṯ","Ṭ"=>"ṭ","Ṫ"=>"ṫ","Ṩ"=>"ṩ","Ṧ"=>"ṧ","Ṥ"=>"ṥ",
 988     "Ṣ"=>"ṣ","Ṡ"=>"ṡ","Ṟ"=>"ṟ","Ṝ"=>"ṝ","Ṛ"=>"ṛ","Ṙ"=>"ṙ","Ṗ"=>"ṗ","Ṕ"=>"ṕ","Ṓ"=>"ṓ","Ṑ"=>"ṑ",
 989     "Ṏ"=>"ṏ","Ṍ"=>"ṍ","Ṋ"=>"ṋ","Ṉ"=>"ṉ","Ṇ"=>"ṇ","Ṅ"=>"ṅ","Ṃ"=>"ṃ","Ṁ"=>"ṁ","Ḿ"=>"ḿ","Ḽ"=>"ḽ",
 990     "Ḻ"=>"ḻ","Ḹ"=>"ḹ","Ḷ"=>"ḷ","Ḵ"=>"ḵ","Ḳ"=>"ḳ","Ḱ"=>"ḱ","Ḯ"=>"ḯ","Ḭ"=>"ḭ","Ḫ"=>"ḫ","Ḩ"=>"ḩ",
 991     "Ḧ"=>"ḧ","Ḥ"=>"ḥ","Ḣ"=>"ḣ","Ḡ"=>"ḡ","Ḟ"=>"ḟ","Ḝ"=>"ḝ","Ḛ"=>"ḛ","Ḙ"=>"ḙ","Ḗ"=>"ḗ","Ḕ"=>"ḕ",
 992     "Ḓ"=>"ḓ","Ḑ"=>"ḑ","Ḏ"=>"ḏ","Ḍ"=>"ḍ","Ḋ"=>"ḋ","Ḉ"=>"ḉ","Ḇ"=>"ḇ","Ḅ"=>"ḅ","Ḃ"=>"ḃ","Ḁ"=>"ḁ",
 993     "Ֆ"=>"ֆ","Օ"=>"օ","Ք"=>"ք","Փ"=>"փ","Ւ"=>"ւ","Ց"=>"ց","Ր"=>"ր","Տ"=>"տ","Վ"=>"վ","Ս"=>"ս",
 994     "Ռ"=>"ռ","Ջ"=>"ջ","Պ"=>"պ","Չ"=>"չ","Ո"=>"ո","Շ"=>"շ","Ն"=>"ն","Յ"=>"յ","Մ"=>"մ","Ճ"=>"ճ",
 995     "Ղ"=>"ղ","Ձ"=>"ձ","Հ"=>"հ","Կ"=>"կ","Ծ"=>"ծ","Խ"=>"խ","Լ"=>"լ","Ի"=>"ի","Ժ"=>"ժ","Թ"=>"թ",
 996     "Ը"=>"ը","Է"=>"է","Զ"=>"զ","Ե"=>"ե","Դ"=>"դ","Գ"=>"գ","Բ"=>"բ","Ա"=>"ա","Ԏ"=>"ԏ","Ԍ"=>"ԍ",
 997     "Ԋ"=>"ԋ","Ԉ"=>"ԉ","Ԇ"=>"ԇ","Ԅ"=>"ԅ","Ԃ"=>"ԃ","Ԁ"=>"ԁ","Ӹ"=>"ӹ","Ӵ"=>"ӵ","Ӳ"=>"ӳ","Ӱ"=>"ӱ",
 998     "Ӯ"=>"ӯ","Ӭ"=>"ӭ","Ӫ"=>"ӫ","Ө"=>"ө","Ӧ"=>"ӧ","Ӥ"=>"ӥ","Ӣ"=>"ӣ","Ӡ"=>"ӡ","Ӟ"=>"ӟ","Ӝ"=>"ӝ",
 999     "Ӛ"=>"ӛ","Ә"=>"ә","Ӗ"=>"ӗ","Ӕ"=>"ӕ","Ӓ"=>"ӓ","Ӑ"=>"ӑ","Ӎ"=>"ӎ","Ӌ"=>"ӌ","Ӊ"=>"ӊ","Ӈ"=>"ӈ",
1000     "Ӆ"=>"ӆ","Ӄ"=>"ӄ","Ӂ"=>"ӂ","Ҿ"=>"ҿ","Ҽ"=>"ҽ","Һ"=>"һ","Ҹ"=>"ҹ","Ҷ"=>"ҷ","Ҵ"=>"ҵ","Ҳ"=>"ҳ",
1001     "Ұ"=>"ұ","Ү"=>"ү","Ҭ"=>"ҭ","Ҫ"=>"ҫ","Ҩ"=>"ҩ","Ҧ"=>"ҧ","Ҥ"=>"ҥ","Ң"=>"ң","Ҡ"=>"ҡ","Ҟ"=>"ҟ",
1002     "Ҝ"=>"ҝ","Қ"=>"қ","Ҙ"=>"ҙ","Җ"=>"җ","Ҕ"=>"ҕ","Ғ"=>"ғ","Ґ"=>"ґ","Ҏ"=>"ҏ","Ҍ"=>"ҍ","Ҋ"=>"ҋ",
1003     "Ҁ"=>"ҁ","Ѿ"=>"ѿ","Ѽ"=>"ѽ","Ѻ"=>"ѻ","Ѹ"=>"ѹ","Ѷ"=>"ѷ","Ѵ"=>"ѵ","Ѳ"=>"ѳ","Ѱ"=>"ѱ","Ѯ"=>"ѯ",
1004     "Ѭ"=>"ѭ","Ѫ"=>"ѫ","Ѩ"=>"ѩ","Ѧ"=>"ѧ","Ѥ"=>"ѥ","Ѣ"=>"ѣ","Ѡ"=>"ѡ","Џ"=>"џ","Ў"=>"ў","Ѝ"=>"ѝ",
1005     "Ќ"=>"ќ","Ћ"=>"ћ","Њ"=>"њ","Љ"=>"љ","Ј"=>"ј","Ї"=>"ї","І"=>"і","Ѕ"=>"ѕ","Є"=>"є","Ѓ"=>"ѓ",
1006     "Ђ"=>"ђ","Ё"=>"ё","Ѐ"=>"ѐ","Я"=>"я","Ю"=>"ю","Э"=>"э","Ь"=>"ь","Ы"=>"ы","Ъ"=>"ъ","Щ"=>"щ",
1007     "Ш"=>"ш","Ч"=>"ч","Ц"=>"ц","Х"=>"х","Ф"=>"ф","У"=>"у","Т"=>"т","С"=>"с","Р"=>"р","П"=>"п",
1008     "О"=>"о","Н"=>"н","М"=>"м","Л"=>"л","К"=>"к","Й"=>"й","И"=>"и","З"=>"з","Ж"=>"ж","Е"=>"е",
1009     "Д"=>"д","Г"=>"г","В"=>"в","Б"=>"б","А"=>"а","Ε"=>"ϵ","Σ"=>"ϲ","Ρ"=>"ϱ","Κ"=>"ϰ","Ϯ"=>"ϯ",
1010     "Ϭ"=>"ϭ","Ϫ"=>"ϫ","Ϩ"=>"ϩ","Ϧ"=>"ϧ","Ϥ"=>"ϥ","Ϣ"=>"ϣ","Ϡ"=>"ϡ","Ϟ"=>"ϟ","Ϝ"=>"ϝ","Ϛ"=>"ϛ",
1011     "Ϙ"=>"ϙ","Π"=>"ϖ","Φ"=>"ϕ","Θ"=>"ϑ","Β"=>"ϐ","Ώ"=>"ώ","Ύ"=>"ύ","Ό"=>"ό","Ϋ"=>"ϋ","Ϊ"=>"ϊ",
1012     "Ω"=>"ω","Ψ"=>"ψ","Χ"=>"χ","Φ"=>"φ","Υ"=>"υ","Τ"=>"τ","Σ"=>"σ","Ρ"=>"ρ","Π"=>"π", // capital sigma => final sigma pair dropped: as the last duplicate it overrode small sigma
1013     "Ο"=>"ο","Ξ"=>"ξ","Ν"=>"ν","Μ"=>"μ","Λ"=>"λ","Κ"=>"κ","Ι"=>"ι","Θ"=>"θ","Η"=>"η","Ζ"=>"ζ",
1014     "Ε"=>"ε","Δ"=>"δ","Γ"=>"γ","Β"=>"β","Α"=>"α","Ί"=>"ί","Ή"=>"ή","Έ"=>"έ","Ά"=>"ά","Ʒ"=>"ʒ",
1015     "Ʋ"=>"ʋ","Ʊ"=>"ʊ","Ʈ"=>"ʈ","Ʃ"=>"ʃ","Ʀ"=>"ʀ","Ɵ"=>"ɵ","Ɲ"=>"ɲ","Ɯ"=>"ɯ","Ɩ"=>"ɩ","Ɨ"=>"ɨ",
1016     "Ɣ"=>"ɣ","Ɛ"=>"ɛ","Ə"=>"ə","Ɗ"=>"ɗ","Ɖ"=>"ɖ","Ɔ"=>"ɔ","Ɓ"=>"ɓ","Ȳ"=>"ȳ","Ȱ"=>"ȱ","Ȯ"=>"ȯ",
1017     "Ȭ"=>"ȭ","Ȫ"=>"ȫ","Ȩ"=>"ȩ","Ȧ"=>"ȧ","Ȥ"=>"ȥ","Ȣ"=>"ȣ","Ȟ"=>"ȟ","Ȝ"=>"ȝ","Ț"=>"ț","Ș"=>"ș",
1018     "Ȗ"=>"ȗ","Ȕ"=>"ȕ","Ȓ"=>"ȓ","Ȑ"=>"ȑ","Ȏ"=>"ȏ","Ȍ"=>"ȍ","Ȋ"=>"ȋ","Ȉ"=>"ȉ","Ȇ"=>"ȇ","Ȅ"=>"ȅ",
1019     "Ȃ"=>"ȃ","Ȁ"=>"ȁ","Ǿ"=>"ǿ","Ǽ"=>"ǽ","Ǻ"=>"ǻ","Ǹ"=>"ǹ","Ǵ"=>"ǵ","ǲ"=>"ǳ","Ǯ"=>"ǯ","Ǭ"=>"ǭ",
1020     "Ǫ"=>"ǫ","Ǩ"=>"ǩ","Ǧ"=>"ǧ","Ǥ"=>"ǥ","Ǣ"=>"ǣ","Ǡ"=>"ǡ","Ǟ"=>"ǟ","Ǝ"=>"ǝ","Ǜ"=>"ǜ","Ǚ"=>"ǚ",
1021     "Ǘ"=>"ǘ","Ǖ"=>"ǖ","Ǔ"=>"ǔ","Ǒ"=>"ǒ","Ǐ"=>"ǐ","Ǎ"=>"ǎ","ǋ"=>"ǌ","ǈ"=>"ǉ","ǅ"=>"ǆ","Ƿ"=>"ƿ",
1022     "Ƽ"=>"ƽ","Ƹ"=>"ƹ","Ƶ"=>"ƶ","Ƴ"=>"ƴ","Ư"=>"ư","Ƭ"=>"ƭ","Ƨ"=>"ƨ","Ƥ"=>"ƥ","Ƣ"=>"ƣ","Ơ"=>"ơ",
1023     "Ƞ"=>"ƞ","Ƙ"=>"ƙ","Ƕ"=>"ƕ","Ƒ"=>"ƒ","Ƌ"=>"ƌ","Ƈ"=>"ƈ","Ƅ"=>"ƅ","Ƃ"=>"ƃ","S"=>"ſ","Ž"=>"ž",
1024     "Ż"=>"ż","Ź"=>"ź","Ŷ"=>"ŷ","Ŵ"=>"ŵ","Ų"=>"ų","Ű"=>"ű","Ů"=>"ů","Ŭ"=>"ŭ","Ū"=>"ū","Ũ"=>"ũ",
1025     "Ŧ"=>"ŧ","Ť"=>"ť","Ţ"=>"ţ","Š"=>"š","Ş"=>"ş","Ŝ"=>"ŝ","Ś"=>"ś","Ř"=>"ř","Ŗ"=>"ŗ","Ŕ"=>"ŕ",
1026     "Œ"=>"œ","Ő"=>"ő","Ŏ"=>"ŏ","Ō"=>"ō","Ŋ"=>"ŋ","Ň"=>"ň","Ņ"=>"ņ","Ń"=>"ń","Ł"=>"ł","Ŀ"=>"ŀ",
1027     "Ľ"=>"ľ","Ļ"=>"ļ","Ĺ"=>"ĺ","Ķ"=>"ķ","Ĵ"=>"ĵ","Ĳ"=>"ĳ","I"=>"ı","Į"=>"į","Ĭ"=>"ĭ","Ī"=>"ī",
1028     "Ĩ"=>"ĩ","Ħ"=>"ħ","Ĥ"=>"ĥ","Ģ"=>"ģ","Ġ"=>"ġ","Ğ"=>"ğ","Ĝ"=>"ĝ","Ě"=>"ě","Ę"=>"ę","Ė"=>"ė",
1029     "Ĕ"=>"ĕ","Ē"=>"ē","Đ"=>"đ","Ď"=>"ď","Č"=>"č","Ċ"=>"ċ","Ĉ"=>"ĉ","Ć"=>"ć","Ą"=>"ą","Ă"=>"ă",
1030     "Ā"=>"ā","Ÿ"=>"ÿ","Þ"=>"þ","Ý"=>"ý","Ü"=>"ü","Û"=>"û","Ú"=>"ú","Ù"=>"ù","Ø"=>"ø","Ö"=>"ö",
1031     "Õ"=>"õ","Ô"=>"ô","Ó"=>"ó","Ò"=>"ò","Ñ"=>"ñ","Ð"=>"ð","Ï"=>"ï","Î"=>"î","Í"=>"í","Ì"=>"ì",
1032     "Ë"=>"ë","Ê"=>"ê","É"=>"é","È"=>"è","Ç"=>"ç","Æ"=>"æ","Å"=>"å","Ä"=>"ä","Ã"=>"ã","Â"=>"â",
1033     "Á"=>"á","À"=>"à","Z"=>"z","Y"=>"y","X"=>"x","W"=>"w","V"=>"v","U"=>"u","T"=>"t", // Greek capital mu => micro sign pair dropped: as the last duplicate it overrode Greek small mu
1034     "S"=>"s","R"=>"r","Q"=>"q","P"=>"p","O"=>"o","N"=>"n","M"=>"m","L"=>"l","K"=>"k","J"=>"j",
1035     "I"=>"i","H"=>"h","G"=>"g","F"=>"f","E"=>"e","D"=>"d","C"=>"c","B"=>"b","A"=>"a"
1036   );
1037 }; // end of case lookup tables
1038
1039 /**
1040  * UTF-8 lookup table for lower case accented letters
1041  *
1042  * Maps lower case accented or special letters (UTF-8 encoded keys) to plain
1043  * 7-bit ASCII replacements; a replacement may be two letters long ('ß' => 'ss').
1044  *
1045  * @author Andreas Gohr <andi@splitbrain.org>
1046  * @see    utf8_deaccent()
1047  */
1048 global $UTF8_LOWER_ACCENTS;
1049 $UTF8_LOWER_ACCENTS = array(
1050   'à' => 'a', 'ô' => 'o', 'ď' => 'd', 'ḟ' => 'f', 'ë' => 'e', 'š' => 's', 'ơ' => 'o',
1051   'ß' => 'ss', 'ă' => 'a', 'ř' => 'r', 'ț' => 't', 'ň' => 'n', 'ā' => 'a', 'ķ' => 'k',
1052   'ŝ' => 's', 'ỳ' => 'y', 'ņ' => 'n', 'ĺ' => 'l', 'ħ' => 'h', 'ṗ' => 'p', 'ó' => 'o',
1053   'ú' => 'u', 'ě' => 'e', 'é' => 'e', 'ç' => 'c', 'ẁ' => 'w', 'ċ' => 'c', 'õ' => 'o',
1054   'ṡ' => 's', 'ø' => 'o', 'ģ' => 'g', 'ŧ' => 't', 'ș' => 's', 'ė' => 'e', 'ĉ' => 'c',
1055   'ś' => 's', 'î' => 'i', 'ű' => 'u', 'ć' => 'c', 'ę' => 'e', 'ŵ' => 'w', 'ṫ' => 't',
1056   'ū' => 'u', 'č' => 'c', 'ö' => 'oe', 'è' => 'e', 'ŷ' => 'y', 'ą' => 'a', 'ł' => 'l',
1057   'ų' => 'u', 'ů' => 'u', 'ş' => 's', 'ğ' => 'g', 'ļ' => 'l', 'ƒ' => 'f', 'ž' => 'z',
1058   'ẃ' => 'w', 'ḃ' => 'b', 'å' => 'a', 'ì' => 'i', 'ï' => 'i', 'ḋ' => 'd', 'ť' => 't',
1059   'ŗ' => 'r', 'ä' => 'ae', 'í' => 'i', 'ŕ' => 'r', 'ê' => 'e', 'ü' => 'ue', 'ò' => 'o',
1060   'ē' => 'e', 'ñ' => 'n', 'ń' => 'n', 'ĥ' => 'h', 'ĝ' => 'g', 'đ' => 'd', 'ĵ' => 'j',
1061   'ÿ' => 'y', 'ũ' => 'u', 'ŭ' => 'u', 'ư' => 'u', 'ţ' => 't', 'ý' => 'y', 'ő' => 'o',
1062   'â' => 'a', 'ľ' => 'l', 'ẅ' => 'w', 'ż' => 'z', 'ī' => 'i', 'ã' => 'a', 'ġ' => 'g',
1063   'ṁ' => 'm', 'ō' => 'o', 'ĩ' => 'i', 'ù' => 'u', 'į' => 'i', 'ź' => 'z', 'á' => 'a',
1064   'û' => 'u', 'þ' => 'th', 'ð' => 'dh', 'æ' => 'ae', 'µ' => 'u', 'ĕ' => 'e',
1065 );
1066
1067 /**
1068  * UTF-8 lookup table for upper case accented letters
1069  *
1070  * Maps upper case accented or special letters (UTF-8 encoded keys) to plain
1071  * 7-bit ASCII replacements; two-letter replacements are capitalised ('Ö' => 'Oe').
1072  *
1073  * @author Andreas Gohr <andi@splitbrain.org>
1074  * @see    utf8_deaccent()
1075  */
1076 global $UTF8_UPPER_ACCENTS;
1077 $UTF8_UPPER_ACCENTS = array(
1078   'À' => 'A', 'Ô' => 'O', 'Ď' => 'D', 'Ḟ' => 'F', 'Ë' => 'E', 'Š' => 'S', 'Ơ' => 'O',
1079   'Ă' => 'A', 'Ř' => 'R', 'Ț' => 'T', 'Ň' => 'N', 'Ā' => 'A', 'Ķ' => 'K',
1080   'Ŝ' => 'S', 'Ỳ' => 'Y', 'Ņ' => 'N', 'Ĺ' => 'L', 'Ħ' => 'H', 'Ṗ' => 'P', 'Ó' => 'O',
1081   'Ú' => 'U', 'Ě' => 'E', 'É' => 'E', 'Ç' => 'C', 'Ẁ' => 'W', 'Ċ' => 'C', 'Õ' => 'O',
1082   'Ṡ' => 'S', 'Ø' => 'O', 'Ģ' => 'G', 'Ŧ' => 'T', 'Ș' => 'S', 'Ė' => 'E', 'Ĉ' => 'C',
1083   'Ś' => 'S', 'Î' => 'I', 'Ű' => 'U', 'Ć' => 'C', 'Ę' => 'E', 'Ŵ' => 'W', 'Ṫ' => 'T',
1084   'Ū' => 'U', 'Č' => 'C', 'Ö' => 'Oe', 'È' => 'E', 'Ŷ' => 'Y', 'Ą' => 'A', 'Ł' => 'L',
1085   'Ų' => 'U', 'Ů' => 'U', 'Ş' => 'S', 'Ğ' => 'G', 'Ļ' => 'L', 'Ƒ' => 'F', 'Ž' => 'Z',
1086   'Ẃ' => 'W', 'Ḃ' => 'B', 'Å' => 'A', 'Ì' => 'I', 'Ï' => 'I', 'Ḋ' => 'D', 'Ť' => 'T',
1087   'Ŗ' => 'R', 'Ä' => 'Ae', 'Í' => 'I', 'Ŕ' => 'R', 'Ê' => 'E', 'Ü' => 'Ue', 'Ò' => 'O',
1088   'Ē' => 'E', 'Ñ' => 'N', 'Ń' => 'N', 'Ĥ' => 'H', 'Ĝ' => 'G', 'Đ' => 'D', 'Ĵ' => 'J',
1089   'Ÿ' => 'Y', 'Ũ' => 'U', 'Ŭ' => 'U', 'Ư' => 'U', 'Ţ' => 'T', 'Ý' => 'Y', 'Ő' => 'O',
1090   'Â' => 'A', 'Ľ' => 'L', 'Ẅ' => 'W', 'Ż' => 'Z', 'Ī' => 'I', 'Ã' => 'A', 'Ġ' => 'G',
1091   'Ṁ' => 'M', 'Ō' => 'O', 'Ĩ' => 'I', 'Ù' => 'U', 'Į' => 'I', 'Ź' => 'Z', 'Á' => 'A',
1092   'Û' => 'U', 'Þ' => 'Th', 'Ð' => 'Dh', 'Æ' => 'Ae', 'Ĕ' => 'E',
1093 );
1094
1095 /**
1096  * List of common special characters, given as Unicode code points
1097  *
1098  * This array should contain all special characters (neither letter nor digit)
1099  * defined in the various local charsets - it is not a complete list of the
1100  * non-alphanumeric characters in Unicode, but should match most cases of special
1101  * chars. Entries are integers in ascending order and each is listed only once.
1102  *
1103  * The control chars 0x00 to 0x19 are _not_ included in this array. The space 0x20 is!
1104  * These chars are _not_ in the array either:  _ (0x5f), : 0x3a, . 0x2e, - 0x2d, * 0x2a
1105  *
1106  * @author Andreas Gohr <andi@splitbrain.org>
1107  * @see    utf8_stripspecials()
1108  */
1109 global $UTF8_SPECIAL_CHARS;
1110 $UTF8_SPECIAL_CHARS = array(
1111   0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f, 0x0020, 0x0021, 0x0022, 0x0023,
1112   0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029,         0x002b, 0x002c,
1113           0x002f,         0x003b, 0x003c, 0x003d, 0x003e, 0x003f, 0x0040, 0x005b,
1114   0x005c, 0x005d, 0x005e,         0x0060, 0x007b, 0x007c, 0x007d, 0x007e,
1115   0x007f, 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, 0x0088,
1116   0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, 0x0090, 0x0091, 0x0092,
1117   0x0093, 0x0094, 0x0095, 0x0096, 0x0097, 0x0098, 0x0099, 0x009a, 0x009b, 0x009c,
1118   0x009d, 0x009e, 0x009f, 0x00a0, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6,
1119   0x00a7, 0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00af, 0x00b0,
1120   0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7, 0x00b8, 0x00b9, 0x00ba,
1121   0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf, 0x00d7, 0x00f7, 0x02c7, 0x02d8, 0x02d9,
1122   0x02da, 0x02db, 0x02dc, 0x02dd, 0x0300, 0x0301, 0x0303, 0x0309, 0x0323, 0x0384,
1123   0x0385, 0x0387, 0x03b2, 0x03c6, 0x03d1, 0x03d2, 0x03d5, 0x03d6, 0x05b0, 0x05b1,
1124   0x05b2, 0x05b3, 0x05b4, 0x05b5, 0x05b6, 0x05b7, 0x05b8, 0x05b9, 0x05bb, 0x05bc,
1125   0x05bd, 0x05be, 0x05bf, 0x05c0, 0x05c1, 0x05c2, 0x05c3, 0x05f3, 0x05f4, 0x060c,
1126   0x061b, 0x061f, 0x0640, 0x064b, 0x064c, 0x064d, 0x064e, 0x064f, 0x0650, 0x0651,
1127   0x0652, 0x066a, 0x0e3f, 0x200c, 0x200d, 0x200e, 0x200f, 0x2013, 0x2014, 0x2015,
1128   0x2017, 0x2018, 0x2019, 0x201a, 0x201c, 0x201d, 0x201e, 0x2020, 0x2021, 0x2022,
1129   0x2026, 0x2030, 0x2032, 0x2033, 0x2039, 0x203a, 0x2044, 0x20a7, 0x20aa, 0x20ab,
1130   0x20ac, 0x2116, 0x2118, 0x2122, 0x2126, 0x2135, 0x2190, 0x2191, 0x2192, 0x2193,
1131   0x2194, 0x2195, 0x21b5, 0x21d0, 0x21d1, 0x21d2, 0x21d3, 0x21d4, 0x2200, 0x2202,
1132   0x2203, 0x2205, 0x2206, 0x2207, 0x2208, 0x2209, 0x220b, 0x220f, 0x2211, 0x2212,
1133   0x2215, 0x2217, 0x2219, 0x221a, 0x221d, 0x221e, 0x2220, 0x2227, 0x2228, 0x2229,
1134   0x222a, 0x222b, 0x2234, 0x223c, 0x2245, 0x2248, 0x2260, 0x2261, 0x2264, 0x2265,
1135   0x2282, 0x2283, 0x2284, 0x2286, 0x2287, 0x2295, 0x2297, 0x22a5, 0x22c5, 0x2310,
1136   0x2320, 0x2321, 0x2329, 0x232a, 0x2469, 0x2500, 0x2502, 0x250c, 0x2510, 0x2514,
1137   0x2518, 0x251c, 0x2524, 0x252c, 0x2534, 0x253c, 0x2550, 0x2551, 0x2552, 0x2553,
1138   0x2554, 0x2555, 0x2556, 0x2557, 0x2558, 0x2559, 0x255a, 0x255b, 0x255c, 0x255d,
1139   0x255e, 0x255f, 0x2560, 0x2561, 0x2562, 0x2563, 0x2564, 0x2565, 0x2566, 0x2567,
1140   0x2568, 0x2569, 0x256a, 0x256b, 0x256c, 0x2580, 0x2584, 0x2588, 0x258c, 0x2590,
1141   0x2591, 0x2592, 0x2593, 0x25a0, 0x25b2, 0x25bc, 0x25c6, 0x25ca, 0x25cf, 0x25d7,
1142   0x2605, 0x260e, 0x261b, 0x261e, 0x2660, 0x2663, 0x2665, 0x2666, 0x2701, 0x2702,
1143   0x2703, 0x2704, 0x2706, 0x2707, 0x2708, 0x2709, 0x270c, 0x270d, 0x270e, 0x270f,
1144   0x2710, 0x2711, 0x2712, 0x2713, 0x2714, 0x2715, 0x2716, 0x2717, 0x2718, 0x2719,
1145   0x271a, 0x271b, 0x271c, 0x271d, 0x271e, 0x271f, 0x2720, 0x2721, 0x2722, 0x2723,
1146   0x2724, 0x2725, 0x2726, 0x2727, 0x2729, 0x272a, 0x272b, 0x272c, 0x272d, 0x272e,
1147   0x272f, 0x2730, 0x2731, 0x2732, 0x2733, 0x2734, 0x2735, 0x2736, 0x2737, 0x2738,
1148   0x2739, 0x273a, 0x273b, 0x273c, 0x273d, 0x273e, 0x273f, 0x2740, 0x2741, 0x2742,
1149   0x2743, 0x2744, 0x2745, 0x2746, 0x2747, 0x2748, 0x2749, 0x274a, 0x274b, 0x274d,
1150   0x274f, 0x2750, 0x2751, 0x2752, 0x2756, 0x2758, 0x2759, 0x275a, 0x275b, 0x275c,
1151   0x275d, 0x275e, 0x2761, 0x2762, 0x2763, 0x2764, 0x2765, 0x2766, 0x2767, 0x277f,
1152   0x2789, 0x2793, 0x2794, 0x2798, 0x2799, 0x279a, 0x279b, 0x279c, 0x279d, 0x279e,
1153   0x279f, 0x27a0, 0x27a1, 0x27a2, 0x27a3, 0x27a4, 0x27a5, 0x27a6, 0x27a7, 0x27a8,
1154   0x27a9, 0x27aa, 0x27ab, 0x27ac, 0x27ad, 0x27ae, 0x27af, 0x27b1, 0x27b2, 0x27b3,
1155   0x27b4, 0x27b5, 0x27b6, 0x27b7, 0x27b8, 0x27b9, 0x27ba, 0x27bb, 0x27bc, 0x27bd,
1156   0x27be, 0x3000, 0x3001, 0x3002, 0x3003, 0x3008, 0x3009, 0x300a, 0x300b, 0x300c,
1157   0x300d, 0x300e, 0x300f, 0x3010, 0x3011, 0x3012, 0x3014, 0x3015, 0x3016, 0x3017,
1158   0x3018, 0x3019, 0x301a, 0x301b, 0x3036,
1159   0xf6d9, 0xf6da, 0xf6db, 0xf8d7, 0xf8d8, 0xf8d9, 0xf8da, 0xf8db, 0xf8dc,
1160   0xf8dd, 0xf8de, 0xf8df, 0xf8e0, 0xf8e1, 0xf8e2, 0xf8e3, 0xf8e4, 0xf8e5, 0xf8e6,
1161   0xf8e7, 0xf8e8, 0xf8e9, 0xf8ea, 0xf8eb, 0xf8ec, 0xf8ed, 0xf8ee, 0xf8ef, 0xf8f0,
1162   0xf8f1, 0xf8f2, 0xf8f3, 0xf8f4, 0xf8f5, 0xf8f6, 0xf8f7, 0xf8f8, 0xf8f9, 0xf8fa,
1163   0xf8fb, 0xf8fc, 0xf8fd, 0xf8fe, 0xfe7c, 0xfe7d,
1164           0xff01, 0xff02, 0xff03, 0xff04, 0xff05, 0xff06, 0xff07, 0xff08, 0xff09,
1165           0xff0a, 0xff0b, 0xff0c, 0xff0d, 0xff0e, 0xff0f, 0xff1a, 0xff1b, 0xff1c, // 0xff09 is already the last entry of the row above
1166   0xff1d, 0xff1e, 0xff1f, 0xff20, 0xff3b, 0xff3c, 0xff3d, 0xff3e, 0xff40, 0xff5b,
1167   0xff5c, 0xff5d, 0xff5e, 0xff5f, 0xff60, 0xff61, 0xff62, 0xff63, 0xff64, 0xff65,
1168   0xffe0, 0xffe1, 0xffe2, 0xffe3, 0xffe4, 0xffe5, 0xffe6, 0xffe8, 0xffe9, 0xffea,
1169   0xffeb, 0xffec, 0xffed, 0xffee,
1170 );
1171
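1172 // UTF-8 string version of the $UTF8_SPECIAL_CHARS code point list above (NOTE(review): as stored here the literal holds U+FFFD replacement characters and backslash sequences that single quotes do not interpret - verify it against that list)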
1173 global $UTF8_SPECIAL_CHARS2;
1174 $UTF8_SPECIAL_CHARS2 =
1175     "\x1A".'\e\1c\1d\1e\1f !"#$%&\'()+,/;<=>?@[\]^`{|}~\7f\80\81\82\83\84\85\86\87\88\89\8a\8b\8c\8d\8e\8f\90\91\92\93\94\95�'.
1176     '�\97\98\99\9a\9b\9c\9d\9e\9f ¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶·¸¹º»¼½�'.
1177     '�¿×÷ˇ˘˙˚˛˜˝̣̀́̃̉΄΅·βφϑϒϕϖְֱֲֳִֵֶַָֹֻּֽ־ֿ�'.
1178     '�ׁׂ׃׳״،؛؟ـًٌٍَُِّْ٪฿‌‍‎‏–—―‗‘’‚“”�'.
1179     '��†‡•…‰′″‹›⁄₧₪₫€№℘™Ωℵ←↑→↓↔↕↵'.
1180     '⇐⇑⇒⇓⇔∀∂∃∅∆∇∈∉∋∏∑−∕∗∙√∝∞∠∧∨�'.
1181     '�∪∫∴∼≅≈≠≡≤≥⊂⊃⊄⊆⊇⊕⊗⊥⋅⌐⌠⌡〈〉⑩─�'.
1182     '��┌┐└┘├┤┬┴┼═║╒╓╔╕╖╗╘╙╚╛╜╝╞╟╠'.
1183     '╡╢╣╤╥╦╧╨╩╪╫╬▀▄█▌▐░▒▓■▲▼◆◊●�'.
1184     '�★☎☛☞♠♣♥♦✁✂✃✄✆✇✈✉✌✍✎✏✐✑✒✓✔✕�'.
1185     '��✗✘✙✚✛✜✝✞✟✠✡✢✣✤✥✦✧✩✪✫✬✭✮✯✰✱'.
1186     '✲✳✴✵✶✷✸✹✺✻✼✽✾✿❀❁❂❃❄❅❆❇❈❉❊❋�'.
1187     '�❏❐❑❒❖❘❙❚❛❜❝❞❡❢❣❤❥❦❧❿➉➓➔➘➙➚�'.
1188     '��➜➝➞➟➠➡➢➣➤➥➦➧➨➩➪➫➬➭➮➯➱➲➳➴➵➶'.
1189     '➷➸➹➺➻➼➽➾'.
1190     '　、。〃〈〉《》「」『』【】〒〔〕〖〗〘〙〚〛〶'.
1191     '�'.
1192     '�ﹼﹽ'.
1193     '！＂＃＄％＆＇（）＊＋，－．／：；＜＝＞？＠［＼］＾｀｛｜｝～'.
1194     '｟｠｡｢｣､･￠￡￢￣￤￥￦￨￩￪￫￬￭￮';
1195
1196 /**
1197  * Romanization lookup table
1198  *
1199  * This lookup table provides a way to transform strings written in scripts
1200  * that are not based on Latin letters into plain ASCII.
1201  *
1202  * Please note: this is not a scientific transliteration table. It only works
1203  * one way, from non-Latin to ASCII, and by simple character replacement
1204  * only. Peculiarities of individual languages are not supported.
1205  *
1206  * @author Andreas Gohr <andi@splitbrain.org>
1207  * @author Vitaly Blokhin <vitinfo@vitn.com>
1208  * @link   http://www.uconv.com/translit.htm
1209  * @author Bisqwit <bisqwit@iki.fi>
1210  * @link   http://kanjidict.stc.cx/hiragana.php?src=2
1211  * @link   http://www.translatum.gr/converter/greek-transliteration.htm
1212  * @link   http://en.wikipedia.org/wiki/Royal_Thai_General_System_of_Transcription
1213  * @link   http://www.btranslations.com/resources/romanization/korean.asp
1214  */
1215 global $UTF8_ROMANIZATION;
1216 $UTF8_ROMANIZATION = array(
1217   //russian cyrillic
1218   'а'=>'a','А'=>'A','б'=>'b','Б'=>'B','в'=>'v','В'=>'V','г'=>'g','Г'=>'G',
1219   'д'=>'d','Д'=>'D','е'=>'e','Е'=>'E','ё'=>'jo','Ё'=>'Jo','ж'=>'zh','Ж'=>'Zh',
1220   'з'=>'z','З'=>'Z','и'=>'i','И'=>'I','й'=>'j','Й'=>'J','к'=>'k','К'=>'K',
1221   'л'=>'l','Л'=>'L','м'=>'m','М'=>'M','н'=>'n','Н'=>'N','о'=>'o','О'=>'O',
1222   'п'=>'p','П'=>'P','р'=>'r','Р'=>'R','с'=>'s','С'=>'S','т'=>'t','Т'=>'T',
1223   'у'=>'u','У'=>'U','ф'=>'f','Ф'=>'F','х'=>'x','Х'=>'X','ц'=>'c','Ц'=>'C',
1224   'ч'=>'ch','Ч'=>'Ch','ш'=>'sh','Ш'=>'Sh','щ'=>'sch','Щ'=>'Sch','ъ'=>'',
1225   'Ъ'=>'','ы'=>'y','Ы'=>'Y','ь'=>'','Ь'=>'','э'=>'eh','Э'=>'Eh','ю'=>'ju',
1226   'Ю'=>'Ju','я'=>'ja','Я'=>'Ja',
1227   // Ukrainian cyrillic
1228   'Ґ'=>'Gh','ґ'=>'gh','Є'=>'Je','є'=>'je','І'=>'I','і'=>'i','Ї'=>'Ji','ї'=>'ji',
1229   // Georgian
1230   'ა'=>'a','ბ'=>'b','გ'=>'g','დ'=>'d','ე'=>'e','ვ'=>'v','ზ'=>'z','თ'=>'th',
1231   'ი'=>'i','კ'=>'p','ლ'=>'l','მ'=>'m','ნ'=>'n','ო'=>'o','პ'=>'p','ჟ'=>'zh',
1232   'რ'=>'r','ს'=>'s','ტ'=>'t','უ'=>'u','ფ'=>'ph','ქ'=>'kh','ღ'=>'gh','ყ'=>'q',
1233   'შ'=>'sh','ჩ'=>'ch','ც'=>'c','ძ'=>'dh','წ'=>'w','ჭ'=>'j','ხ'=>'x','ჯ'=>'jh',
1234   'ჰ'=>'xh',
1235   //Sanskrit
1236   'अ'=>'a','आ'=>'ah','इ'=>'i','ई'=>'ih','उ'=>'u','ऊ'=>'uh','ऋ'=>'ry',
1237   'ॠ'=>'ryh','ऌ'=>'ly','ॡ'=>'lyh','ए'=>'e','ऐ'=>'ay','ओ'=>'o','औ'=>'aw',
1238   'अं'=>'amh','अः'=>'aq','क'=>'k','ख'=>'kh','ग'=>'g','घ'=>'gh','ङ'=>'nh',
1239   'च'=>'c','छ'=>'ch','ज'=>'j','झ'=>'jh','ञ'=>'ny','ट'=>'tq','ठ'=>'tqh',
1240   'ड'=>'dq','ढ'=>'dqh','ण'=>'nq','त'=>'t','थ'=>'th','द'=>'d','ध'=>'dh',
1241   'न'=>'n','प'=>'p','फ'=>'ph','ब'=>'b','भ'=>'bh','म'=>'m','य'=>'z','र'=>'r',
1242   'ल'=>'l','व'=>'v','श'=>'sh','ष'=>'sqh','स'=>'s','ह'=>'x',
1243   //Hebrew
1244   'א'=>'a', 'ב'=>'b','ג'=>'g','ד'=>'d','ה'=>'h','ו'=>'v','ז'=>'z','ח'=>'kh','ט'=>'th',
1245   'י'=>'y','ך'=>'h','כ'=>'k','ל'=>'l','ם'=>'m','מ'=>'m','ן'=>'n','נ'=>'n',
1246   'ס'=>'s','ע'=>'ah','ף'=>'f','פ'=>'p','ץ'=>'c','צ'=>'c','ק'=>'q','ר'=>'r',
1247   'ש'=>'sh','ת'=>'t',
1248   //Arabic
1249   'ا'=>'a','ب'=>'b','ت'=>'t','ث'=>'th','ج'=>'g','ح'=>'xh','خ'=>'x','د'=>'d',
1250   'ذ'=>'dh','ر'=>'r','ز'=>'z','س'=>'s','ش'=>'sh','ص'=>'s\'','ض'=>'d\'',
1251   'ط'=>'t\'','ظ'=>'z\'','ع'=>'y','غ'=>'gh','ف'=>'f','ق'=>'q','ك'=>'k',
1252   'ل'=>'l','م'=>'m','ن'=>'n','ه'=>'x\'','و'=>'u','ي'=>'i',
1253
1254   // Japanese hiragana
1255   'あ'=>'a','え'=>'e','い'=>'i','お'=>'o','う'=>'u','ば'=>'ba','べ'=>'be',
1256   'び'=>'bi','ぼ'=>'bo','ぶ'=>'bu','し'=>'ci','だ'=>'da','で'=>'de','ぢ'=>'di',
1257   'ど'=>'do','づ'=>'du','ふぁ'=>'fa','ふぇ'=>'fe','ふぃ'=>'fi','ふぉ'=>'fo',
1258   'ふ'=>'fu','が'=>'ga','げ'=>'ge','ぎ'=>'gi','ご'=>'go','ぐ'=>'gu','は'=>'ha',
1259   'へ'=>'he','ひ'=>'hi','ほ'=>'ho','ふ'=>'hu','じゃ'=>'ja','じぇ'=>'je',
1260   'じ'=>'ji','じょ'=>'jo','じゅ'=>'ju','か'=>'ka','け'=>'ke','き'=>'ki',
1261   'こ'=>'ko','く'=>'ku','ら'=>'la','れ'=>'le','り'=>'li','ろ'=>'lo','る'=>'lu',
1262   'ま'=>'ma','め'=>'me','み'=>'mi','も'=>'mo','む'=>'mu','な'=>'na','ね'=>'ne',
1263   'に'=>'ni','の'=>'no','ぬ'=>'nu','ぱ'=>'pa','ぺ'=>'pe','ぴ'=>'pi','ぽ'=>'po',
1264   'ぷ'=>'pu','ら'=>'ra','れ'=>'re','り'=>'ri','ろ'=>'ro','る'=>'ru','さ'=>'sa',
1265   'せ'=>'se','し'=>'si','そ'=>'so','す'=>'su','た'=>'ta','て'=>'te','ち'=>'ti',
1266   'と'=>'to','つ'=>'tu','ヴぁ'=>'va','ヴぇ'=>'ve','ヴぃ'=>'vi','ヴぉ'=>'vo',
1267   'ヴ'=>'vu','わ'=>'wa','うぇ'=>'we','うぃ'=>'wi','を'=>'wo','や'=>'ya','いぇ'=>'ye',
1268   'い'=>'yi','よ'=>'yo','ゆ'=>'yu','ざ'=>'za','ぜ'=>'ze','じ'=>'zi','ぞ'=>'zo',
1269   'ず'=>'zu','びゃ'=>'bya','びぇ'=>'bye','びぃ'=>'byi','びょ'=>'byo','びゅ'=>'byu',
1270   'ちゃ'=>'cha','ちぇ'=>'che','ち'=>'chi','ちょ'=>'cho','ちゅ'=>'chu','ちゃ'=>'cya',
1271   'ちぇ'=>'cye','ちぃ'=>'cyi','ちょ'=>'cyo','ちゅ'=>'cyu','でゃ'=>'dha','でぇ'=>'dhe',
1272   'でぃ'=>'dhi','でょ'=>'dho','でゅ'=>'dhu','どぁ'=>'dwa','どぇ'=>'dwe','どぃ'=>'dwi',
1273   'どぉ'=>'dwo','どぅ'=>'dwu','ぢゃ'=>'dya','ぢぇ'=>'dye','ぢぃ'=>'dyi','ぢょ'=>'dyo',
1274   'ぢゅ'=>'dyu','ぢ'=>'dzi','ふぁ'=>'fwa','ふぇ'=>'fwe','ふぃ'=>'fwi','ふぉ'=>'fwo',
1275   'ふぅ'=>'fwu','ふゃ'=>'fya','ふぇ'=>'fye','ふぃ'=>'fyi','ふょ'=>'fyo','ふゅ'=>'fyu',
1276   'ぎゃ'=>'gya','ぎぇ'=>'gye','ぎぃ'=>'gyi','ぎょ'=>'gyo','ぎゅ'=>'gyu','ひゃ'=>'hya',
1277   'ひぇ'=>'hye','ひぃ'=>'hyi','ひょ'=>'hyo','ひゅ'=>'hyu','じゃ'=>'jya','じぇ'=>'jye',
1278   'じぃ'=>'jyi','じょ'=>'jyo','じゅ'=>'jyu','きゃ'=>'kya','きぇ'=>'kye','きぃ'=>'kyi',
1279   'きょ'=>'kyo','きゅ'=>'kyu','りゃ'=>'lya','りぇ'=>'lye','りぃ'=>'lyi','りょ'=>'lyo',
1280   'りゅ'=>'lyu','みゃ'=>'mya','みぇ'=>'mye','みぃ'=>'myi','みょ'=>'myo','みゅ'=>'myu',
1281   'ん'=>'n','にゃ'=>'nya','にぇ'=>'nye','にぃ'=>'nyi','にょ'=>'nyo','にゅ'=>'nyu',
1282   'ぴゃ'=>'pya','ぴぇ'=>'pye','ぴぃ'=>'pyi','ぴょ'=>'pyo','ぴゅ'=>'pyu','りゃ'=>'rya',
1283   'りぇ'=>'rye','りぃ'=>'ryi','りょ'=>'ryo','りゅ'=>'ryu','しゃ'=>'sha','しぇ'=>'she',
1284   'し'=>'shi','しょ'=>'sho','しゅ'=>'shu','すぁ'=>'swa','すぇ'=>'swe','すぃ'=>'swi',
1285   'すぉ'=>'swo','すぅ'=>'swu','しゃ'=>'sya','しぇ'=>'sye','しぃ'=>'syi','しょ'=>'syo',
1286   'しゅ'=>'syu','てゃ'=>'tha','てぇ'=>'the','てぃ'=>'thi','てょ'=>'tho','てゅ'=>'thu',
1287   'つゃ'=>'tsa','つぇ'=>'tse','つぃ'=>'tsi','つょ'=>'tso','つ'=>'tsu','とぁ'=>'twa',
1288   'とぇ'=>'twe','とぃ'=>'twi','とぉ'=>'two','とぅ'=>'twu','ちゃ'=>'tya','ちぇ'=>'tye',
1289   'ちぃ'=>'tyi','ちょ'=>'tyo','ちゅ'=>'tyu','ヴゃ'=>'vya','ヴぇ'=>'vye','ヴぃ'=>'vyi',
1290   'ヴょ'=>'vyo','ヴゅ'=>'vyu','うぁ'=>'wha','うぇ'=>'whe','うぃ'=>'whi','うぉ'=>'who',
1291   'うぅ'=>'whu','ゑ'=>'wye','ゐ'=>'wyi','じゃ'=>'zha','じぇ'=>'zhe','じぃ'=>'zhi',
1292   'じょ'=>'zho','じゅ'=>'zhu','じゃ'=>'zya','じぇ'=>'zye','じぃ'=>'zyi','じょ'=>'zyo',
1293   'じゅ'=>'zyu',
1294   // Japanese katakana
1295   'ア'=>'a','エ'=>'e','イ'=>'i','オ'=>'o','ウ'=>'u','バ'=>'ba','ベ'=>'be','ビ'=>'bi',
1296   'ボ'=>'bo','ブ'=>'bu','シ'=>'ci','ダ'=>'da','デ'=>'de','ヂ'=>'di','ド'=>'do',
1297   'ヅ'=>'du','ファ'=>'fa','フェ'=>'fe','フィ'=>'fi','フォ'=>'fo','フ'=>'fu','ガ'=>'ga',
1298   'ゲ'=>'ge','ギ'=>'gi','ゴ'=>'go','グ'=>'gu','ハ'=>'ha','ヘ'=>'he','ヒ'=>'hi','ホ'=>'ho',
1299   'フ'=>'hu','ジャ'=>'ja','ジェ'=>'je','ジ'=>'ji','ジョ'=>'jo','ジュ'=>'ju','カ'=>'ka',
1300   'ケ'=>'ke','キ'=>'ki','コ'=>'ko','ク'=>'ku','ラ'=>'la','レ'=>'le','リ'=>'li','ロ'=>'lo',
1301   'ル'=>'lu','マ'=>'ma','メ'=>'me','ミ'=>'mi','モ'=>'mo','ム'=>'mu','ナ'=>'na','ネ'=>'ne',
1302   'ニ'=>'ni','ノ'=>'no','ヌ'=>'nu','パ'=>'pa','ペ'=>'pe','ピ'=>'pi','ポ'=>'po','プ'=>'pu',
1303   'ラ'=>'ra','レ'=>'re','リ'=>'ri','ロ'=>'ro','ル'=>'ru','サ'=>'sa','セ'=>'se','シ'=>'si',
1304   'ソ'=>'so','ス'=>'su','タ'=>'ta','テ'=>'te','チ'=>'ti','ト'=>'to','ツ'=>'tu','ヴァ'=>'va',
1305   'ヴェ'=>'ve','ヴィ'=>'vi','ヴォ'=>'vo','ヴ'=>'vu','ワ'=>'wa','ウェ'=>'we','ウィ'=>'wi',
1306   'ヲ'=>'wo','ヤ'=>'ya','イェ'=>'ye','イ'=>'yi','ヨ'=>'yo','ユ'=>'yu','ザ'=>'za','ゼ'=>'ze',
1307   'ジ'=>'zi','ゾ'=>'zo','ズ'=>'zu','ビャ'=>'bya','ビェ'=>'bye','ビィ'=>'byi','ビョ'=>'byo',
1308   'ビュ'=>'byu','チャ'=>'cha','チェ'=>'che','チ'=>'chi','チョ'=>'cho','チュ'=>'chu',
1309   'チャ'=>'cya','チェ'=>'cye','チィ'=>'cyi','チョ'=>'cyo','チュ'=>'cyu','デャ'=>'dha',
1310   'デェ'=>'dhe','ディ'=>'dhi','デョ'=>'dho','デュ'=>'dhu','ドァ'=>'dwa','ドェ'=>'dwe',
1311   'ドィ'=>'dwi','ドォ'=>'dwo','ドゥ'=>'dwu','ヂャ'=>'dya','ヂェ'=>'dye','ヂィ'=>'dyi',
1312   'ヂョ'=>'dyo','ヂュ'=>'dyu','ヂ'=>'dzi','ファ'=>'fwa','フェ'=>'fwe','フィ'=>'fwi',
1313   'フォ'=>'fwo','フゥ'=>'fwu','フャ'=>'fya','フェ'=>'fye','フィ'=>'fyi','フョ'=>'fyo',
1314   'フュ'=>'fyu','ギャ'=>'gya','ギェ'=>'gye','ギィ'=>'gyi','ギョ'=>'gyo','ギュ'=>'gyu',
1315   'ヒャ'=>'hya','ヒェ'=>'hye','ヒィ'=>'hyi','ヒョ'=>'hyo','ヒュ'=>'hyu','ジャ'=>'jya',
1316   'ジェ'=>'jye','ジィ'=>'jyi','ジョ'=>'jyo','ジュ'=>'jyu','キャ'=>'kya','キェ'=>'kye',
1317   'キィ'=>'kyi','キョ'=>'kyo','キュ'=>'kyu','リャ'=>'lya','リェ'=>'lye','リィ'=>'lyi',
1318   'リョ'=>'lyo','リュ'=>'lyu','ミャ'=>'mya','ミェ'=>'mye','ミィ'=>'myi','ミョ'=>'myo',
1319   'ミュ'=>'myu','ン'=>'n','ニャ'=>'nya','ニェ'=>'nye','ニィ'=>'nyi','ニョ'=>'nyo',
1320   'ニュ'=>'nyu','ピャ'=>'pya','ピェ'=>'pye','ピィ'=>'pyi','ピョ'=>'pyo','ピュ'=>'pyu',
1321   'リャ'=>'rya','リェ'=>'rye','リィ'=>'ryi','リョ'=>'ryo','リュ'=>'ryu','シャ'=>'sha',
1322   'シェ'=>'she','シ'=>'shi','ショ'=>'sho','シュ'=>'shu','スァ'=>'swa','スェ'=>'swe',
1323   'スィ'=>'swi','スォ'=>'swo','スゥ'=>'swu','シャ'=>'sya','シェ'=>'sye','シィ'=>'syi',
1324   'ショ'=>'syo','シュ'=>'syu','テャ'=>'tha','テェ'=>'the','ティ'=>'thi','テョ'=>'tho',
1325   'テュ'=>'thu','ツャ'=>'tsa','ツェ'=>'tse','ツィ'=>'tsi','ツョ'=>'tso','ツ'=>'tsu',
1326   'トァ'=>'twa','トェ'=>'twe','トィ'=>'twi','トォ'=>'two','トゥ'=>'twu','チャ'=>'tya',
1327   'チェ'=>'tye','チィ'=>'tyi','チョ'=>'tyo','チュ'=>'tyu','ヴャ'=>'vya','ヴェ'=>'vye',
1328   'ヴィ'=>'vyi','ヴョ'=>'vyo','ヴュ'=>'vyu','ウァ'=>'wha','ウェ'=>'whe','ウィ'=>'whi',
1329   'ウォ'=>'who','ウゥ'=>'whu','ヱ'=>'wye','ヰ'=>'wyi','ジャ'=>'zha','ジェ'=>'zhe',
1330   'ジィ'=>'zhi','ジョ'=>'zho','ジュ'=>'zhu','ジャ'=>'zya','ジェ'=>'zye','ジィ'=>'zyi',
1331   'ジョ'=>'zyo','ジュ'=>'zyu',
1332
1333   // "Greeklish"
1334   'Γ'=>'G','Δ'=>'E','Θ'=>'Th','Λ'=>'L','Ξ'=>'X','Π'=>'P','Σ'=>'S','Φ'=>'F','Ψ'=>'Ps',
1335   'γ'=>'g','δ'=>'e','θ'=>'th','λ'=>'l','ξ'=>'x','π'=>'p','σ'=>'s','φ'=>'f','ψ'=>'ps',
1336
1337   // Thai
1338   'ก'=>'k','ข'=>'kh','ฃ'=>'kh','ค'=>'kh','ฅ'=>'kh','ฆ'=>'kh','ง'=>'ng','จ'=>'ch',
1339   'ฉ'=>'ch','ช'=>'ch','ซ'=>'s','ฌ'=>'ch','ญ'=>'y','ฎ'=>'d','ฏ'=>'t','ฐ'=>'th',
1340   'ฑ'=>'d','ฒ'=>'th','ณ'=>'n','ด'=>'d','ต'=>'t','ถ'=>'th','ท'=>'th','ธ'=>'th',
1341   'น'=>'n','บ'=>'b','ป'=>'p','ผ'=>'ph','ฝ'=>'f','พ'=>'ph','ฟ'=>'f','ภ'=>'ph',
1342   'ม'=>'m','ย'=>'y','ร'=>'r','ฤ'=>'rue','ฤๅ'=>'rue','ล'=>'l','ฦ'=>'lue',
1343   'ฦๅ'=>'lue','ว'=>'w','ศ'=>'s','ษ'=>'s','ส'=>'s','ห'=>'h','ฬ'=>'l','ฮ'=>'h',
1344   'ะ'=>'a','–ั'=>'a','รร'=>'a','า'=>'a','รร'=>'an','ำ'=>'am','–ิ'=>'i','–ี'=>'i',
1345   '–ึ'=>'ue','–ื'=>'ue','–ุ'=>'u','–ู'=>'u','เะ'=>'e','เ–็'=>'e','เ'=>'e','แะ'=>'ae',
1346   'แ'=>'ae','โะ'=>'o','โ'=>'o','เาะ'=>'o','อ'=>'o','เอะ'=>'oe','เ–ิ'=>'oe',
1347   'เอ'=>'oe','เ–ียะ'=>'ia','เ–ีย'=>'ia','เ–ือะ'=>'uea','เ–ือ'=>'uea','–ัวะ'=>'ua',
1348   '–ัว'=>'ua','ว'=>'ua','ใ'=>'ai','ไ'=>'ai','–ัย'=>'ai','ไย'=>'ai','าย'=>'ai',
1349   'เา'=>'ao','าว'=>'ao','–ุย'=>'ui','โย'=>'oi','อย'=>'oi','เย'=>'oei','เ–ือย'=>'ueai',
1350   'วย'=>'uai','–ิว'=>'io','เ–็ว'=>'eo','เว'=>'eo','แ–็ว'=>'aeo','แว'=>'aeo',
1351   'เ–ียว'=>'iao',
1352
1353   // Korean
1354   'ㄱ'=>'k','ㅋ'=>'kh','ㄲ'=>'kk','ㄷ'=>'t','ㅌ'=>'th','ㄸ'=>'tt','ㅂ'=>'p',
1355   'ㅍ'=>'ph','ㅃ'=>'pp','ㅈ'=>'c','ㅊ'=>'ch','ㅉ'=>'cc','ㅅ'=>'s','ㅆ'=>'ss',
1356   'ㅎ'=>'h','ㅇ'=>'ng','ㄴ'=>'n','ㄹ'=>'l','ㅁ'=>'m', 'ㅏ'=>'a','ㅓ'=>'e','ㅗ'=>'o',
1357   'ㅜ'=>'wu','ㅡ'=>'u','ㅣ'=>'i','ㅐ'=>'ay','ㅔ'=>'ey','ㅚ'=>'oy','ㅘ'=>'wa','ㅝ'=>'we',
1358   'ㅟ'=>'wi','ㅙ'=>'way','ㅞ'=>'wey','ㅢ'=>'uy','ㅑ'=>'ya','ㅕ'=>'ye','ㅛ'=>'oy',
1359   'ㅠ'=>'yu','ㅒ'=>'yay','ㅖ'=>'yey',
1360 );
1361
1362 //Setup VIM: ex: et ts=2 enc=utf-8 :
1363