Вход Регистрация
Файл: forsoc.ru/includes/utf/utf_normalizer.php
Строк: 1063
<?php
/**
*
* This file is part of the phpBB Forum Software package.
*
* @copyright (c) phpBB Limited <https://www.phpbb.com>
* @license GNU General Public License, version 2 (GPL-2.0)
*
* For full copyright and license information, please see
* the docs/CREDITS.txt file.
*
*/

/**
*/
if (!defined('IN_PHPBB'))
{
    exit;
}

/**
* Some Unicode characters encoded in UTF-8
*
* Preserved for compatibility
*/
define('UTF8_REPLACEMENT'"xEFxBFxBD");
define('UTF8_MAX'"xF4x8FxBFxBF");
define('UTF8_FFFE'"xEFxBFxBE");
define('UTF8_FFFF'"xEFxBFxBF");
define('UTF8_SURROGATE_FIRST'"xEDxA0x80");
define('UTF8_SURROGATE_LAST'"xEDxBFxBF");
define('UTF8_HANGUL_FIRST'"xEAxB0x80");
define('UTF8_HANGUL_LAST'"xEDx9ExA3");

define('UTF8_CJK_FIRST'"xE4xB8x80");
define('UTF8_CJK_LAST'"xE9xBExBB");
define('UTF8_CJK_B_FIRST'"xF0xA0x80x80");
define('UTF8_CJK_B_LAST'"xF0xAAx9Bx96");

// Unset global variables
unset($GLOBALS['utf_jamo_index'], $GLOBALS['utf_jamo_type'], $GLOBALS['utf_nfc_qc'], $GLOBALS['utf_combining_class'], $GLOBALS['utf_canonical_comp'], $GLOBALS['utf_canonical_decomp'], $GLOBALS['utf_nfkc_qc'], $GLOBALS['utf_compatibility_decomp']);

// NFC_QC and NFKC_QC values
define('UNICODE_QC_MAYBE'0);
define('UNICODE_QC_NO'1);

// Contains all the ASCII characters appearing in UTF-8, sorted by frequency
define('UTF8_ASCII_RANGE'"x20x65x69x61x73x6Ex74x72x6Fx6Cx75x64x5Dx5Bx63x6Dx70x27x0Ax67x7Cx68x76x2Ex66x62x2Cx3Ax3Dx2Dx71x31x30x43x32x2Ax79x78x29x28x4Cx39x41x53x2Fx50x22x45x6Ax4Dx49x6Bx33x3Ex35x54x3Cx44x34x7Dx42x7Bx38x46x77x52x36x37x55x47x4Ex3Bx4Ax7Ax56x23x48x4Fx57x5Fx26x21x4Bx3Fx58x51x25x59x5Cx09x5Ax2Bx7Ex5Ex24x40x60x7Fx00x01x02x03x04x05x06x07x08x0Bx0Cx0Dx0Ex0Fx10x11x12x13x14x15x16x17x18x19x1Ax1Bx1Cx1Dx1Ex1F");

// Contains all the tail bytes that can appear in the composition of a UTF-8 char
define('UTF8_TRAILING_BYTES'"xA9xA0xA8x80xAAx99xA7xBBxABx89x94x82xB4xA2xAEx83xB0xB9xB8x93xAFxBCxB3x81xA4xB2x9CxA1xB5xBExBDxBAx98xADxB1x84x95xA6xB6x88x8Dx90xB7xBFx92x85xA5x97x8Cx86xA3x8Ex9Fx8Fx87x91x9DxACx9Ex8Bx96x9Bx8Ax9A");

// Constants used by the Hangul [de]composition algorithms
define('UNICODE_HANGUL_SBASE'0xAC00);
define('UNICODE_HANGUL_LBASE'0x1100);
define('UNICODE_HANGUL_VBASE'0x1161);
define('UNICODE_HANGUL_TBASE'0x11A7);
define('UNICODE_HANGUL_SCOUNT'11172);
define('UNICODE_HANGUL_LCOUNT'19);
define('UNICODE_HANGUL_VCOUNT'21);
define('UNICODE_HANGUL_TCOUNT'28);
define('UNICODE_HANGUL_NCOUNT'588);
define('UNICODE_JAMO_L'0);
define('UNICODE_JAMO_V'1);
define('UNICODE_JAMO_T'2);

/**
* Unicode normalization routines
*/
class utf_normalizer
{
    
/**
    * Validate, cleanup and normalize a string
    *
    * The ultimate convenience function! Clean up invalid UTF-8 sequences,
    * and convert to Normal Form C, canonical composition.
    *
    * @param    string    &$str    The dirty string
    * @return    string            The same string, all shiny and cleaned-up
    */
    
static function cleanup(&$str)
    {
        
// The string below is the list of all autorized characters, sorted by frequency in latin text
        
$pos strspn($str"x20x65x69x61x73x6Ex74x72x6Fx6Cx75x64x5Dx5Bx63x6Dx70x27x0Ax67x7Cx68x76x2Ex66x62x2Cx3Ax3Dx2Dx71x31x30x43x32x2Ax79x78x29x28x4Cx39x41x53x2Fx50x22x45x6Ax4Dx49x6Bx33x3Ex35x54x3Cx44x34x7Dx42x7Bx38x46x77x52x36x37x55x47x4Ex3Bx4Ax7Ax56x23x48x4Fx57x5Fx26x21x4Bx3Fx58x51x25x59x5Cx09x5Ax2Bx7Ex5Ex24x40x60x7Fx0D");
        
$len strlen($str);

        if (
$pos == $len)
        {
            
// ASCII strings with no special chars return immediately
            
return;
        }

        
// Note: we do not check for $GLOBALS['utf_canonical_decomp']. It is assumed they are always loaded together
        
if (!isset($GLOBALS['utf_nfc_qc']))
        {
            global 
$phpbb_root_path$phpEx;
            include(
$phpbb_root_path 'includes/utf/data/utf_nfc_qc.' $phpEx);
        }

        if (!isset(
$GLOBALS['utf_canonical_decomp']))
        {
            global 
$phpbb_root_path$phpEx;
            include(
$phpbb_root_path 'includes/utf/data/utf_canonical_decomp.' $phpEx);
        }

        
// Replace any byte in the range 0x00..0x1F, except for r, n and t
        // We replace those characters with a 0xFF byte, which is illegal in UTF-8 and will in turn be replaced with a UTF replacement char
        
$str strtr(
            
$str,
            
"x00x01x02x03x04x05x06x07x08x0Bx0Cx0Ex0Fx10x11x12x13x14x15x16x17x18x19x1Ax1Bx1Cx1Dx1Ex1F",
            
"xFFxFFxFFxFFxFFxFFxFFxFFxFFxFFxFFxFFxFFxFFxFFxFFxFFxFFxFFxFFxFFxFFxFFxFFxFFxFFxFFxFFxFF"
        
);

        
$str utf_normalizer::recompose($str$pos$len$GLOBALS['utf_nfc_qc'], $GLOBALS['utf_canonical_decomp']);
    }

    
/**
    * Validate and normalize a UTF string to NFC
    *
    * @param    string    &$str    Unchecked UTF string
    * @return    string            The string, validated and in normal form
    */
    
static function nfc(&$str)
    {
        
$pos strspn($strUTF8_ASCII_RANGE);
        
$len strlen($str);

        if (
$pos == $len)
        {
            
// ASCII strings return immediately
            
return;
        }

        if (!isset(
$GLOBALS['utf_nfc_qc']))
        {
            global 
$phpbb_root_path$phpEx;
            include(
$phpbb_root_path 'includes/utf/data/utf_nfc_qc.' $phpEx);
        }

        if (!isset(
$GLOBALS['utf_canonical_decomp']))
        {
            global 
$phpbb_root_path$phpEx;
            include(
$phpbb_root_path 'includes/utf/data/utf_canonical_decomp.' $phpEx);
        }

        
$str utf_normalizer::recompose($str$pos$len$GLOBALS['utf_nfc_qc'], $GLOBALS['utf_canonical_decomp']);
    }

    
/**
    * Validate and normalize a UTF string to NFKC
    *
    * @param    string    &$str    Unchecked UTF string
    * @return    string            The string, validated and in normal form
    */
    
static function nfkc(&$str)
    {
        
$pos strspn($strUTF8_ASCII_RANGE);
        
$len strlen($str);

        if (
$pos == $len)
        {
            
// ASCII strings return immediately
            
return;
        }

        if (!isset(
$GLOBALS['utf_nfkc_qc']))
        {
            global 
$phpbb_root_path$phpEx;
            include(
$phpbb_root_path 'includes/utf/data/utf_nfkc_qc.' $phpEx);
        }

        if (!isset(
$GLOBALS['utf_compatibility_decomp']))
        {
            global 
$phpbb_root_path$phpEx;
            include(
$phpbb_root_path 'includes/utf/data/utf_compatibility_decomp.' $phpEx);
        }

        
$str utf_normalizer::recompose($str$pos$len$GLOBALS['utf_nfkc_qc'], $GLOBALS['utf_compatibility_decomp']);
    }

    
/**
    * Validate and normalize a UTF string to NFD
    *
    * @param    string    &$str    Unchecked UTF string
    * @return    string            The string, validated and in normal form
    */
    
static function nfd(&$str)
    {
        
$pos strspn($strUTF8_ASCII_RANGE);
        
$len strlen($str);

        if (
$pos == $len)
        {
            
// ASCII strings return immediately
            
return;
        }

        if (!isset(
$GLOBALS['utf_canonical_decomp']))
        {
            global 
$phpbb_root_path$phpEx;
            include(
$phpbb_root_path 'includes/utf/data/utf_canonical_decomp.' $phpEx);
        }

        
$str utf_normalizer::decompose($str$pos$len$GLOBALS['utf_canonical_decomp']);
    }

    
/**
    * Validate and normalize a UTF string to NFKD
    *
    * @param    string    &$str    Unchecked UTF string
    * @return    string            The string, validated and in normal form
    */
    
static function nfkd(&$str)
    {
        
$pos strspn($strUTF8_ASCII_RANGE);
        
$len strlen($str);

        if (
$pos == $len)
        {
            
// ASCII strings return immediately
            
return;
        }

        if (!isset(
$GLOBALS['utf_compatibility_decomp']))
        {
            global 
$phpbb_root_path$phpEx;
            include(
$phpbb_root_path 'includes/utf/data/utf_compatibility_decomp.' $phpEx);
        }

        
$str utf_normalizer::decompose($str$pos$len$GLOBALS['utf_compatibility_decomp']);
    }


    
/**
    * Recompose a UTF string
    *
    * @param    string    $str            Unchecked UTF string
    * @param    integer    $pos            Position of the first UTF char (in bytes)
    * @param    integer    $len            Length of the string (in bytes)
    * @param    array    &$qc            Quick-check array, passed by reference but never modified
    * @param    array    &$decomp_map    Decomposition mapping, passed by reference but never modified
    * @return    string                    The string, validated and recomposed
    *
    * @access    private
    */
    
static function recompose($str$pos$len, &$qc, &$decomp_map)
    {
        global 
$utf_combining_class$utf_canonical_comp$utf_jamo_type$utf_jamo_index;

        
// Load some commonly-used tables
        
if (!isset($utf_jamo_index$utf_jamo_type$utf_combining_class))
        {
            global 
$phpbb_root_path$phpEx;
            include(
$phpbb_root_path 'includes/utf/data/utf_normalizer_common.' $phpEx);
        }

        
// Load the canonical composition table
        
if (!isset($utf_canonical_comp))
        {
            global 
$phpbb_root_path$phpEx;
            include(
$phpbb_root_path 'includes/utf/data/utf_canonical_comp.' $phpEx);
        }

        
// Buffer the last ASCII char before the UTF-8 stuff if applicable
        
$tmp '';
        
$i $tmp_pos $last_cc 0;

        
$buffer = ($pos) ? array(++$i => $str[$pos 1]) : array();

        
// UTF char length array
        // This array is used to determine the length of a UTF character.
        // Be $c the result of ($str[$pos] & "xF0") --where $str is the string we're operating on and $pos
        // the position of the cursor--, if $utf_len_mask[$c] does not exist, the byte is an ASCII char.
        // Otherwise, if $utf_len_mask[$c] is greater than 0, we have a the leading byte of a multibyte character
        // whose length is $utf_len_mask[$c] and if it is equal to 0, the byte is a trailing byte.
        
$utf_len_mask = array(
            
// Leading bytes masks
            
"xC0" => 2"xD0" => 2"xE0" => 3"xF0" => 4,
            
// Trailing bytes masks
            
"x80" => 0"x90" => 0"xA0" => 0"xB0" => 0
        
);

        
$extra_check = array(
            
"xED" => 1"xEF" => 1"xC0" => 1"xC1" => 1"xE0" => 1"xF0" => 1,
            
"xF4" => 1"xF5" => 1"xF6" => 1"xF7" => 1"xF8" => 1"xF9" => 1,
            
"xFA" => 1"xFB" => 1"xFC" => 1"xFD" => 1"xFE" => 1"xFF" => 1
        
);

        
$utf_validation_mask = array(
            
2    => "xE0xC0",
            
3    => "xF0xC0xC0",
            
4    => "xF8xC0xC0xC0"
        
);

        
$utf_validation_check = array(
            
2    => "xC0x80",
            
3    => "xE0x80x80",
            
4    => "xF0x80x80x80"
        
);

        
// Main loop
        
do
        {
            
// STEP 0: Capture the current char and buffer it
            
$c $str[$pos];
            
$c_mask $c "xF0";

            if (isset(
$utf_len_mask[$c_mask]))
            {
                
// Byte at $pos is either a leading byte or a missplaced trailing byte
                
if ($utf_len $utf_len_mask[$c_mask])
                {
                    
// Capture the char
                    
$buffer[++$i 7] = $utf_char substr($str$pos$utf_len);

                    
// Let's find out if a thorough check is needed
                    
if (isset($qc[$utf_char]))
                    {
                        
// If the UTF char is in the qc array then it may not be in normal form. We do nothing here, the actual processing is below this "if" block
                    
}
                    else if (isset(
$utf_combining_class[$utf_char]))
                    {
                        if (
$utf_combining_class[$utf_char] < $last_cc)
                        {
                            
// A combining character that is NOT canonically ordered
                        
}
                        else
                        {
                            
// A combining character that IS canonically ordered, skip to the next char
                            
$last_cc $utf_combining_class[$utf_char];

                            
$pos += $utf_len;
                            continue;
                        }
                    }
                    else
                    {
                        
// At this point, $utf_char holds a UTF char that we know is not a NF[K]C_QC and is not a combining character.
                        // It can be a singleton, a canonical composite, a replacement char or an even an ill-formed bunch of bytes. Let's find out
                        
$last_cc 0;

                        
// Check that we have the correct number of trailing bytes
                        
if (($utf_char $utf_validation_mask[$utf_len]) != $utf_validation_check[$utf_len])
                        {
                            
// Current char isn't well-formed or legal: either one or several trailing bytes are missing, or the Unicode char
                            // has been encoded in a five- or six- byte sequence
                            
if ($utf_char[0] >= "xF8")
                            {
                                if (
$utf_char[0] < "xFC")
                                {
                                    
$trailing_bytes 4;
                                }
                                else if (
$utf_char[0] > "xFD")
                                {
                                    
$trailing_bytes 0;
                                }
                                else
                                {
                                    
$trailing_bytes 5;
                                }
                            }
                            else
                            {
                                
$trailing_bytes $utf_len 1;
                            }

                            
$tmp .= substr($str$tmp_pos$pos $tmp_pos) . UTF8_REPLACEMENT;
                            
$pos += strspn($strUTF8_TRAILING_BYTES, ++$pos$trailing_bytes);
                            
$tmp_pos $pos;

                            continue;
                        }

                        if (isset(
$extra_check[$c]))
                        {
                            switch (
$c)
                            {
                                
// Note: 0xED is quite common in Korean
                                
case "xED":
                                    if (
$utf_char >= "xEDxA0x80")
                                    {
                                        
// Surrogates (U+D800..U+DFFF) are not allowed in UTF-8 (UTF sequence 0xEDA080..0xEDBFBF)
                                        
$tmp .= substr($str$tmp_pos$pos $tmp_pos) . UTF8_REPLACEMENT;
                                        
$pos += $utf_len;
                                        
$tmp_pos $pos;
                                        continue 
2;
                                    }
                                break;

                                
// Note: 0xEF is quite common in Japanese
                                
case "xEF":
                                    if (
$utf_char == "xEFxBFxBE" || $utf_char == "xEFxBFxBF")
                                    {
                                        
// U+FFFE and U+FFFF are explicitly disallowed (UTF sequence 0xEFBFBE..0xEFBFBF)
                                        
$tmp .= substr($str$tmp_pos$pos $tmp_pos) . UTF8_REPLACEMENT;
                                        
$pos += $utf_len;
                                        
$tmp_pos $pos;
                                        continue 
2;
                                    }
                                break;

                                case 
"xC0":
                                case 
"xC1":
                                    if (
$utf_char <= "xC1xBF")
                                    {
                                        
// Overlong sequence: Unicode char U+0000..U+007F encoded as a double-byte UTF char
                                        
$tmp .= substr($str$tmp_pos$pos $tmp_pos) . UTF8_REPLACEMENT;
                                        
$pos += $utf_len;
                                        
$tmp_pos $pos;
                                        continue 
2;
                                    }
                                break;

                                case 
"xE0":
                                    if (
$utf_char <= "xE0x9FxBF")
                                    {
                                        
// Unicode char U+0000..U+07FF encoded in 3 bytes
                                        
$tmp .= substr($str$tmp_pos$pos $tmp_pos) . UTF8_REPLACEMENT;
                                        
$pos += $utf_len;
                                        
$tmp_pos $pos;
                                        continue 
2;
                                    }
                                break;

                                case 
"xF0":
                                    if (
$utf_char <= "xF0x8FxBFxBF")
                                    {
                                        
// Unicode char U+0000..U+FFFF encoded in 4 bytes
                                        
$tmp .= substr($str$tmp_pos$pos $tmp_pos) . UTF8_REPLACEMENT;
                                        
$pos += $utf_len;
                                        
$tmp_pos $pos;
                                        continue 
2;
                                    }
                                break;

                                default:
                                    
// Five- and six- byte sequences do not need being checked for here anymore
                                    
if ($utf_char UTF8_MAX)
                                    {
                                        
// Out of the Unicode range
                                        
if ($utf_char[0] < "xF8")
                                        {
                                            
$trailing_bytes 3;
                                        }
                                        else if (
$utf_char[0] < "xFC")
                                        {
                                            
$trailing_bytes 4;
                                        }
                                        else if (
$utf_char[0] > "xFD")
                                        {
                                            
$trailing_bytes 0;
                                        }
                                        else
                                        {
                                            
$trailing_bytes 5;
                                        }

                                        
$tmp .= substr($str$tmp_pos$pos $tmp_pos) . UTF8_REPLACEMENT;
                                        
$pos += strspn($strUTF8_TRAILING_BYTES, ++$pos$trailing_bytes);
                                        
$tmp_pos $pos;
                                        continue 
2;
                                    }
                                break;
                            }
                        }

                        
// The char is a valid starter, move the cursor and go on
                        
$pos += $utf_len;
                        continue;
                    }
                }
                else
                {
                    
// A trailing byte came out of nowhere, we will advance the cursor and treat the this byte and all following trailing bytes as if
                    // each of them was a Unicode replacement char
                    
$spn strspn($strUTF8_TRAILING_BYTES$pos);
                    
$tmp .= substr($str$tmp_pos$pos $tmp_pos) . str_repeat(UTF8_REPLACEMENT$spn);

                    
$pos += $spn;
                    
$tmp_pos $pos;
                    continue;
                }

                
// STEP 1: Decompose current char

                // We have found a character that is either:
                //  - in the NFC_QC/NFKC_QC list
                //  - a non-starter char that is not canonically ordered
                //
                // We are going to capture the shortest UTF sequence that satisfies these two conditions:
                //
                //  1 - If the sequence does not start at the begginning of the string, it must begin with a starter,
                // and that starter must not have the NF[K]C_QC property equal to "MAYBE"
                //
                //  2 - If the sequence does not end at the end of the string, it must end with a non-starter and be
                // immediately followed by a starter that is not on the QC list
                //
                
$utf_seq = array();
                
$last_cc 0;
                
$lpos $pos;
                
$pos += $utf_len;

                if (isset(
$decomp_map[$utf_char]))
                {
                    
$_pos 0;
                    
$_len strlen($decomp_map[$utf_char]);

                    do
                    {
                        
$_utf_len =& $utf_len_mask[$decomp_map[$utf_char][$_pos] & "xF0"];

                        if (isset(
$_utf_len))
                        {
                            
$utf_seq[] = substr($decomp_map[$utf_char], $_pos$_utf_len);
                            
$_pos += $_utf_len;
                        }
                        else
                        {
                            
$utf_seq[] = $decomp_map[$utf_char][$_pos];
                            ++
$_pos;
                        }
                    }
                    while (
$_pos $_len);
                }
                else
                {
                    
// The char is not decomposable
                    
$utf_seq = array($utf_char);
                }

                
// STEP 2: Capture the starter

                // Check out the combining class of the first character of the UTF sequence
                
$k 0;
                if (isset(
$utf_combining_class[$utf_seq[0]]) || $qc[$utf_char] == UNICODE_QC_MAYBE)
                {
                    
// Not a starter, inspect previous characters
                    // The last 8 characters are kept in a buffer so that we don't have to capture them everytime.
                    // This is enough for all real-life strings but even if it wasn't, we can capture characters in backward mode,
                    // although it is slower than this method.
                    //
                    // In the following loop, $j starts at the previous buffered character ($i - 1, because current character is
                    // at offset $i) and process them in backward mode until we find a starter.
                    //
                    // $k is the index on each UTF character inside of our UTF sequence. At this time, $utf_seq contains one or more
                    // characters numbered 0 to n. $k starts at 0 and for each char we prepend we pre-decrement it and for numbering
                    
$starter_found 0;
                    
$j_min max(1$i 7);

                    for (
$j $i 1$j >= $j_min && $lpos $tmp_pos; --$j)
                    {
                        
$utf_char $buffer[$j 7];
                        
$lpos -= strlen($utf_char);

                        if (isset(
$decomp_map[$utf_char]))
                        {
                            
// The char is a composite, decompose for storage
                            
$decomp_seq = array();
                            
$_pos 0;
                            
$_len strlen($decomp_map[$utf_char]);

                            do
                            {
                                
$c $decomp_map[$utf_char][$_pos];
                                
$_utf_len =& $utf_len_mask[$c "xF0"];

                                if (isset(
$_utf_len))
                                {
                                    
$decomp_seq[] = substr($decomp_map[$utf_char], $_pos$_utf_len);
                                    
$_pos += $_utf_len;
                                }
                                else
                                {
                                    
$decomp_seq[] = $c;
                                    ++
$_pos;
                                }
                            }
                            while (
$_pos $_len);

                            
// Prepend the UTF sequence with our decomposed sequence
                            
if (isset($decomp_seq[1]))
                            {
                                
// The char expanded into several chars
                                
$decomp_cnt sizeof($decomp_seq);

                                foreach (
$decomp_seq as $decomp_i => $decomp_char)
                                {
                                    
$utf_seq[$k $decomp_i $decomp_cnt] = $decomp_char;
                                }
                                
$k -= $decomp_cnt;
                            }
                            else
                            {
                                
// Decomposed to a single char, easier to prepend
                                
$utf_seq[--$k] = $decomp_seq[0];
                            }
                        }
                        else
                        {
                            
$utf_seq[--$k] = $utf_char;
                        }

                        if (!isset(
$utf_combining_class[$utf_seq[$k]]))
                        {
                            
// We have found our starter
                            
$starter_found 1;
                            break;
                        }
                    }

                    if (!
$starter_found && $lpos $tmp_pos)
                    {
                        
// The starter was not found in the buffer, let's rewind some more
                        
do
                        {
                            
// $utf_len_mask contains the masks of both leading bytes and trailing bytes. If $utf_en > 0 then it's a leading byte, otherwise it's a trailing byte.
                            
$c $str[--$lpos];
                            
$c_mask $c "xF0";

                            if (isset(
$utf_len_mask[$c_mask]))
                            {
                                
// UTF byte
                                
if ($utf_len $utf_len_mask[$c_mask])
                                {
                                    
// UTF *leading* byte
                                    
$utf_char substr($str$lpos$utf_len);

                                    if (isset(
$decomp_map[$utf_char]))
                                    {
                                        
// Decompose the character
                                        
$decomp_seq = array();
                                        
$_pos 0;
                                        
$_len strlen($decomp_map[$utf_char]);

                                        do
                                        {
                                            
$c $decomp_map[$utf_char][$_pos];
                                            
$_utf_len =& $utf_len_mask[$c "xF0"];

                                            if (isset(
$_utf_len))
                                            {
                                                
$decomp_seq[] = substr($decomp_map[$utf_char], $_pos$_utf_len);
                                                
$_pos += $_utf_len;
                                            }
                                            else
                                            {
                                                
$decomp_seq[] = $c;
                                                ++
$_pos;
                                            }
                                        }
                                        while (
$_pos $_len);

                                        
// Prepend the UTF sequence with our decomposed sequence
                                        
if (isset($decomp_seq[1]))
                                        {
                                            
// The char expanded into several chars
                                            
$decomp_cnt sizeof($decomp_seq);
                                            foreach (
$decomp_seq as $decomp_i => $utf_char)
                                            {
                                                
$utf_seq[$k $decomp_i $decomp_cnt] = $utf_char;
                                            }
                                            
$k -= $decomp_cnt;
                                        }
                                        else
                                        {
                                            
// Decomposed to a single char, easier to prepend
                                            
$utf_seq[--$k] = $decomp_seq[0];
                                        }
                                    }
                                    else
                                    {
                                        
$utf_seq[--$k] = $utf_char;
                                    }
                                }
                            }
                            else
                            {
                                
// ASCII char
                                
$utf_seq[--$k] = $c;
                            }
                        }
                        while (
$lpos $tmp_pos);
                    }
                }

                
// STEP 3: Capture following combining modifiers

                
while ($pos $len)
                {
                    
$c_mask $str[$pos] & "xF0";

                    if (isset(
$utf_len_mask[$c_mask]))
                    {
                        if (
$utf_len $utf_len_mask[$c_mask])
                        {
                            
$utf_char substr($str$pos$utf_len);
                        }
                        else
                        {
                            
// A trailing byte came out of nowhere
                            // Trailing bytes are replaced with Unicode replacement chars, we will just ignore it for now, break out of the loop
                            // as if it was a starter (replacement chars ARE starters) and let the next loop replace it
                            
break;
                        }

                        if (isset(
$utf_combining_class[$utf_char]) || isset($qc[$utf_char]))
                        {
                            
// Combining character, add it to the sequence and move the cursor
                            
if (isset($decomp_map[$utf_char]))
                            {
                                
// Decompose the character
                                
$_pos 0;
                                
$_len strlen($decomp_map[$utf_char]);

                                do
                                {
                                    
$c $decomp_map[$utf_char][$_pos];
                                    
$_utf_len =& $utf_len_mask[$c "xF0"];

                                    if (isset(
$_utf_len))
                                    {
                                        
$utf_seq[] = substr($decomp_map[$utf_char], $_pos$_utf_len);
                                        
$_pos += $_utf_len;
                                    }
                                    else
                                    {
                                        
$utf_seq[] = $c;
                                        ++
$_pos;
                                    }
                                }
                                while (
$_pos $_len);
                            }
                            else
                            {
                                
$utf_seq[] = $utf_char;
                            }

                            
$pos += $utf_len;
                        }
                        else
                        {
                            
// Combining class 0 and no QC, break out of the loop
                            // Note: we do not know if that character is valid. If it's not, the next iteration will replace it
                            
break;
                        }
                    }
                    else
                    {
                        
// ASCII chars are starters
                        
break;
                    }
                }

                
// STEP 4: Sort and combine

                // Here we sort...
                
$k_max $k sizeof($utf_seq);

                if (!
$k && $k_max == 1)
                {
                    
// There is only one char in the UTF sequence, add it then jump to the next iteration of main loop
                        // Note: the two commented lines below can be enabled under PHP5 for a very small performance gain in most cases
//                        if (substr_compare($str, $utf_seq[0], $lpos, $pos - $lpos))
//                        {
                        
$tmp .= substr($str$tmp_pos$lpos $tmp_pos) . $utf_seq[0];
                        
$tmp_pos $pos;
//                        }

                    
continue;
                }

                
// ...there we combine
                
if (isset($utf_combining_class[$utf_seq[$k]]))
                {
                    
$starter $nf_seq '';
                }
                else
                {
                    
$starter $utf_seq[$k++];
                    
$nf_seq '';
                }
                
$utf_sort = array();

                
// We add an empty char at the end of the UTF char sequence. It will act as a starter and trigger the sort/combine routine
                // at the end of the string without altering it
                
$utf_seq[] = '';

                do
                {
                    
$utf_char $utf_seq[$k++];

                    if (isset(
$utf_combining_class[$utf_char]))
                    {
                        
$utf_sort[$utf_combining_class[$utf_char]][] = $utf_char;
                    }
                    else
                    {
                        if (empty(
$utf_sort))
                        {
                            
// No combining characters... check for a composite of the two starters
                            
if (isset($utf_canonical_comp[$starter $utf_char]))
                            {
                                
// Good ol' composite character
                                
$starter $utf_canonical_comp[$starter $utf_char];
                            }
                            else if (isset(
$utf_jamo_type[$utf_char]))
                            {
                                
// Current char is a composable jamo
                                
if (isset($utf_jamo_type[$starter]) && $utf_jamo_type[$starter] == UNICODE_JAMO_L && $utf_jamo_type[$utf_char] == UNICODE_JAMO_V)
                                {
                                    
// We have a L jamo followed by a V jamo, we are going to prefetch the next char to see if it's a T jamo
                                    
if (isset($utf_jamo_type[$utf_seq[$k]]) && $utf_jamo_type[$utf_seq[$k]] == UNICODE_JAMO_T)
                                    {
                                        
// L+V+T jamos, combine to a LVT Hangul syllable ($k is incremented)
                                        
$cp $utf_jamo_index[$starter] + $utf_jamo_index[$utf_char] + $utf_jamo_index[$utf_seq[$k]];
                                        ++
$k;
                                    }
                                    else
                                    {
                                        
// L+V jamos, combine to a LV Hangul syllable
                                        
$cp $utf_jamo_index[$starter] + $utf_jamo_index[$utf_char];
                                    }

                                    
$starter chr(0xE0 | ($cp >> 12)) . chr(0x80 | (($cp >> 6) & 0x3F)) . chr(0x80 | ($cp 0x3F));
                                }
                                else
                                {
                                    
// Non-composable jamo, just add it to the sequence
                                    
$nf_seq .= $starter;
                                    
$starter $utf_char;
                                }
                            }
                            else
                            {
                                
// No composite, just add the first starter to the sequence then continue with the other one
                                
$nf_seq .= $starter;
                                
$starter $utf_char;
                            }
                        }
                        else
                        {
                            
ksort($utf_sort);

                            
// For each class of combining characters
                            
foreach ($utf_sort as $cc => $utf_chars)
                            {
                                
$j 0;

                                do
                                {
                                    
// Look for a composite
                                    
if (isset($utf_canonical_comp[$starter $utf_chars[$j]]))
                                    {
                                        
// Found a composite, replace the starter
                                        
$starter $utf_canonical_comp[$starter $utf_chars[$j]];
                                        unset(
$utf_sort[$cc][$j]);
                                    }
                                    else
                                    {
                                        
// No composite, all following characters in that class are blocked
                                        
break;
                                    }
                                }
                                while (isset(
$utf_sort[$cc][++$j]));
                            }

                            
// Add the starter to the normalized sequence, followed by non-starters in canonical order
                            
$nf_seq .= $starter;

                            foreach (
$utf_sort as $utf_chars)
                            {
                                if (!empty(
$utf_chars))
                                {
                                    
$nf_seq .= implode(''$utf_chars);
                                }
                            }

                            
// Reset the array and go on
                            
$utf_sort = array();
                            
$starter $utf_char;
                        }
                    }
                }
                while (
$k <= $k_max);

                
$tmp .= substr($str$tmp_pos$lpos $tmp_pos) . $nf_seq;
                
$tmp_pos $pos;
            }
            else
            {
                
// Only a ASCII char can make the program get here
                //
                // First we skip the current byte with ++$pos, then we quickly skip following ASCII chars with strspn().
                //
                // The first two "if"'s here can be removed, with the consequences of being faster on latin text (lots of ASCII) and slower on
                // multi-byte text (where the only ASCII chars are spaces and punctuation)
                
if (++$pos != $len)
                {
                    if (
$str[$pos] < "x80")
                    {
                        
$pos += strspn($strUTF8_ASCII_RANGE, ++$pos);
                        
$buffer[++$i 7] = $str[$pos 1];
                    }
                    else
                    {
                        
$buffer[++$i 7] = $c;
                    }
                }
            }
        }
        while (
$pos $len);

        
// Now is time to return the string
        
if ($tmp_pos)
        {
            
// If the $tmp_pos cursor is not at the beggining of the string then at least one character was not in normal form. Replace $str with the fixed version
            
if ($tmp_pos == $len)
            {
                
// The $tmp_pos cursor is at the end of $str, therefore $tmp holds the whole $str
                
return $tmp;
            }
            else
            {
                
// The rightmost chunk of $str has not been appended to $tmp yet
                
return $tmp substr($str$tmp_pos);
            }
        }

        
// The string was already in normal form
        
return $str;
    }

    
/**
    * Decompose a UTF string
    *
    * @param    string    $str            UTF string
    * @param    integer    $pos            Position of the first UTF char (in bytes)
    * @param    integer    $len            Length of the string (in bytes)
    * @param    array    &$decomp_map    Decomposition mapping, passed by reference but never modified
    * @return    string                    The string, decomposed and sorted canonically
    *
    * @access    private
    */
    
static function decompose($str$pos$len, &$decomp_map)
    {
        global 
$utf_combining_class;

        
// Load some commonly-used tables
        
if (!isset($utf_combining_class))
        {
            global 
$phpbb_root_path$phpEx;
            include(
$phpbb_root_path 'includes/utf/data/utf_normalizer_common.' $phpEx);
        }

        
// UTF char length array
        
$utf_len_mask = array(
            
// Leading bytes masks
            
"xC0" => 2"xD0" => 2"xE0" => 3"xF0" => 4,
            
// Trailing bytes masks
            
"x80" => 0"x90" => 0"xA0" => 0"xB0" => 0
        
);

        
// Some extra checks are triggered on the first byte of a UTF sequence
        
$extra_check = array(
            
"xED" => 1"xEF" => 1"xC0" => 1"xC1" => 1"xE0" => 1"xF0" => 1,
            
"xF4" => 1"xF5" => 1"xF6" => 1"xF7" => 1"xF8" => 1"xF9" => 1,
            
"xFA" => 1"xFB" => 1"xFC" => 1"xFD" => 1"xFE" => 1"xFF" => 1
        
);

        
// These masks are used to check if a UTF sequence is well formed. Here are the only 3 lengths we acknowledge:
        //   - 2-byte: 110? ???? 10?? ????
        //   - 3-byte: 1110 ???? 10?? ???? 10?? ????
        //   - 4-byte: 1111 0??? 10?? ???? 10?? ???? 10?? ????
        // Note that 5- and 6- byte sequences are automatically discarded
        
$utf_validation_mask = array(
            
2    => "xE0xC0",
            
3    => "xF0xC0xC0",
            
4    => "xF8xC0xC0xC0"
        
);

        
$utf_validation_check = array(
            
2    => "xC0x80",
            
3    => "xE0x80x80",
            
4    => "xF0x80x80x80"
        
);

        
$tmp '';
        
$starter_pos $pos;
        
$tmp_pos $last_cc $sort $dump 0;
        
$utf_sort = array();

        
// Main loop
        
do
        {
            
// STEP 0: Capture the current char

            
$cur_mask $str[$pos] & "xF0";
            if (isset(
$utf_len_mask[$cur_mask]))
            {
                if (
$utf_len $utf_len_mask[$cur_mask])
                {
                    
// Multibyte char
                    
$utf_char substr($str$pos$utf_len);
                    
$pos += $utf_len;
                }
                else
                {
                    
// A trailing byte came out of nowhere, we will treat it and all following trailing bytes as if each of them was a Unicode
                    // replacement char and we will advance the cursor
                    
$spn strspn($strUTF8_TRAILING_BYTES$pos);

                    if (
$dump)
                    {
                        
$tmp .= substr($str$tmp_pos$starter_pos $tmp_pos);

                        
// Dump combiners
                        
if (!empty($utf_sort))
                        {
                            if (
$sort)
                            {
                                
ksort($utf_sort);
                            }

                            foreach (
$utf_sort as $utf_chars)
                            {
                                
$tmp .= implode(''$utf_chars);
                            }
                        }

                        
$tmp .= str_repeat(UTF8_REPLACEMENT$spn);
                        
$dump $sort 0;
                    }
                    else
                    {
                        
$tmp .= substr($str$tmp_pos$pos $tmp_pos) . str_repeat(UTF8_REPLACEMENT$spn);
                    }

                    
$pos += $spn;
                    
$tmp_pos $starter_pos $pos;

                    
$utf_sort = array();
                    
$last_cc 0;

                    continue;
                }

                
// STEP 1: Decide what to do with current char

                // Now, in that order:
                //  - check if that character is decomposable
                //  - check if that character is a non-starter
                //  - check if that character requires extra checks to be performed
                
if (isset($decomp_map[$utf_char]))
                {
                    
// Decompose the char
                    
$_pos 0;
                    
$_len strlen($decomp_map[$utf_char]);

                    do
                    {
                        
$c $decomp_map[$utf_char][$_pos];
                        
$_utf_len =& $utf_len_mask[$c "xF0"];

                        if (isset(
$_utf_len))
                        {
                            
$_utf_char substr($decomp_map[$utf_char], $_pos$_utf_len);
                            
$_pos += $_utf_len;

                            if (isset(
$utf_combining_class[$_utf_char]))
                            {
                                
// The character decomposed to a non-starter, buffer it for sorting
                                
$utf_sort[$utf_combining_class[$_utf_char]][] = $_utf_char;

                                if (
$utf_combining_class[$_utf_char] < $last_cc)
                                {
                                    
// Not canonically ordered, will require sorting
                                    
$sort $dump 1;
                                }
                                else
                                {
                                    
$dump 1;
                                    
$last_cc $utf_combining_class[$_utf_char];
                                }
                            }
                            else
                            {
                                
// This character decomposition contains a starter, dump the buffer and continue
                                
if ($dump)
                                {
                                    
$tmp .= substr($str$tmp_pos$starter_pos $tmp_pos);

                                    
// Dump combiners
                                    
if (!empty($utf_sort))
                                    {
                                        if (
$sort)
                                        {
                                            
ksort($utf_sort);
                                        }

                                        foreach (
$utf_sort as $utf_chars)
                                        {
                                            
$tmp .= implode(''$utf_chars);
                                        }
                                    }

                                    
$tmp .= $_utf_char;
                                    
$dump $sort 0;
                                }
                                else
                                {
                                    
$tmp .= substr($str$tmp_pos$starter_pos $tmp_pos) . $_utf_char;
                                }

                                
$tmp_pos $starter_pos $pos;
                                
$utf_sort = array();
                                
$last_cc 0;
                            }
                        }
                        else
                        {
                            
// This character decomposition contains an ASCII char, which is a starter. Dump the buffer and continue
                            
++$_pos;

                            if (
$dump)
                            {
                                
$tmp .= substr($str$tmp_pos$starter_pos $tmp_pos);

                                
// Dump combiners
                                
if (!empty($utf_sort))
                                {
                                    if (
$sort)
                                    {
                                        
ksort($utf_sort);
                                    }

                                    foreach (
$utf_sort as $utf_chars)
                                    {
                                        
$tmp .= implode(''$utf_chars);
                                    }
                                }

                                
$tmp .= $c;
                                
$dump $sort 0;
                            }
                            else
                            {
                                
$tmp .= substr($str$tmp_pos$pos $utf_len $tmp_pos) . $c;
                            }

                            
$tmp_pos $starter_pos $pos;
                            
$utf_sort = array();
                            
$last_cc 0;
                        }
                    }
                    while (
$_pos $_len);
                }
                else if (isset(
$utf_combining_class[$utf_char]))
                {
                    
// Combining character
                    
if ($utf_combining_class[$utf_char] < $last_cc)
                    {
                        
// Not in canonical order
                        
$sort $dump 1;
                    }
                    else
                    {
                        
$last_cc $utf_combining_class[$utf_char];
                    }

                    
$utf_sort[$utf_combining_class[$utf_char]][] = $utf_char;
                }
                else
                {
                    
// Non-decomposable starter, check out if it's a Hangul syllable
                    
if ($utf_char UTF8_HANGUL_FIRST || $utf_char UTF8_HANGUL_LAST)
                    {
                        
// Nope, regular UTF char, check that we have the correct number of trailing bytes
                        
if (($utf_char $utf_validation_mask[$utf_len]) != $utf_validation_check[$utf_len])
                        {
                            
// Current char isn't well-formed or legal: either one or several trailing bytes are missing, or the Unicode char
                            // has been encoded in a five- or six- byte sequence.
                            // Move the cursor back to its original position then advance it to the position it should really be at
                            
$pos -= $utf_len;
                            
$tmp .= substr($str$tmp_pos$starter_pos $tmp_pos);

                            if (!empty(
$utf_sort))
                            {
                                
ksort($utf_sort);

                                foreach (
$utf_sort as $utf_chars)
                                {
                                    
$tmp .= implode(''$utf_chars);
                                }
                                
$utf_sort = array();
                            }

                            
// Add a replacement char then another replacement char for every trailing byte.
                            //
                            // @todo I'm not entirely sure that's how we're supposed to mark invalidated byte sequences, check this
                            
$spn strspn($strUTF8_TRAILING_BYTES, ++$pos);
                            
$tmp .= str_repeat(UTF8_REPLACEMENT$spn 1);

                            
$dump $sort 0;

                            
$pos += $spn;
                            
$tmp_pos $pos;
                            continue;
                        }

                        if (isset(
$extra_check[$utf_char[0]]))
                        {
                            switch (
$utf_char[0])
                            {
                                
// Note: 0xED is quite common in Korean
                                
case "xED":
                                    if (
$utf_char >= "xEDxA0x80")
                                    {
                                        
// Surrogates (U+D800..U+DFFF) are not allowed in UTF-8 (UTF sequence 0xEDA080..0xEDBFBF)
                                        
$tmp .= substr($str$tmp_pos$starter_pos $tmp_pos);

                                        if (!empty(
$utf_sort))
                                        {
                                            
ksort($utf_sort);

                                            foreach (
$utf_sort as $utf_chars)
                                            {
                                                
$tmp .= implode(''$utf_chars);
                                            }
                                            
$utf_sort = array();
                                        }

                                        
$tmp .= UTF8_REPLACEMENT;
                                        
$dump $sort 0;

                                        
$tmp_pos $starter_pos $pos;
                                        continue 
2;
                                    }
                                break;

                                
// Note: 0xEF is quite common in Japanese
                                
case "xEF":
                                    if (
$utf_char == "xEFxBFxBE" || $utf_char == "xEFxBFxBF")
                                    {
                                        
// U+FFFE and U+FFFF are explicitly disallowed (UTF sequence 0xEFBFBE..0xEFBFBF)
                                        
$tmp .= substr($str$tmp_pos$starter_pos $tmp_pos);

                                        if (!empty(
$utf_sort))
                                        {
                                            
ksort($utf_sort);

                                            foreach (
$utf_sort as $utf_chars)
                                            {
                                                
$tmp .= implode(''$utf_chars);
                                            }
                                            
$utf_sort = array();
                                        }

                                        
$tmp .= UTF8_REPLACEMENT;
                                        
$dump $sort 0;

                                        
$tmp_pos $starter_pos $pos;
                                        continue 
2;
                                    }
                                break;

                                case 
"xC0":
                                case 
"xC1":
                                    if (
$utf_char <= "xC1xBF")
                                    {
                                        
// Overlong sequence: Unicode char U+0000..U+007F encoded as a double-byte UTF char
                                        
$tmp .= substr($str$tmp_pos$starter_pos $tmp_pos);

                                        if (!empty(
$utf_sort))
                                        {
                                            
ksort($utf_sort);

                                            foreach (
$utf_sort as $utf_chars)
                                            {
                                                
$tmp .= implode(''$utf_chars);
                                            }
                                            
$utf_sort = array();
                                        }

                                        
$tmp .= UTF8_REPLACEMENT;
                                        
$dump $sort 0;

                                        
$tmp_pos $starter_pos $pos;
                                        continue 
2;
                                    }
                                break;

                                case 
"xE0":
                                    if (
$utf_char <= "xE0x9FxBF")
                                    {
                                        
// Unicode char U+0000..U+07FF encoded in 3 bytes
                                        
$tmp .= substr($str$tmp_pos$starter_pos $tmp_pos);

                                        if (!empty(
$utf_sort))
                                        {
                                            
ksort($utf_sort);

                                            foreach (
$utf_sort as $utf_chars)
                                            {
                                                
$tmp .= implode(''$utf_chars);
                                            }
                                            
$utf_sort = array();
                                        }

                                        
$tmp .= UTF8_REPLACEMENT;
                                        
$dump $sort 0;

                                        
$tmp_pos $starter_pos $pos;
                                        continue 
2;
                                    }
                                break;

                                case 
"xF0":
                                    if (
$utf_char <= "xF0x8FxBFxBF")
                                    {
                                        
// Unicode char U+0000..U+FFFF encoded in 4 bytes
                                        
$tmp .= substr($str$tmp_pos$starter_pos $tmp_pos);

                                        if (!empty(
$utf_sort))
                                        {
                                            
ksort($utf_sort);

                                            foreach (
$utf_sort as $utf_chars)
                                            {
                                                
$tmp .= implode(''$utf_chars);
                                            }
                                            
$utf_sort = array();
                                        }

                                        
$tmp .= UTF8_REPLACEMENT;
                                        
$dump $sort 0;

                                        
$tmp_pos $starter_pos $pos;
                                        continue 
2;
                                    }
                                break;

                                default:
                                    if (
$utf_char UTF8_MAX)
                                    {
                                        
// Out of the Unicode range
                                        
$tmp .= substr($str$tmp_pos$starter_pos $tmp_pos);

                                        if (!empty(
$utf_sort))
                                        {
                                            
ksort($utf_sort);

                                            foreach (
$utf_sort as $utf_chars)
                                            {
                                                
$tmp .= implode(''$utf_chars);
                                            }
                                            
$utf_sort = array();
                                        }

                                        
$tmp .= UTF8_REPLACEMENT;
                                        
$dump $sort 0;

                                        
$tmp_pos $starter_pos $pos;
                                        continue 
2;
                                    }
                                break;
                            }
                        }
                    }
                    else
                    {
                        
// Hangul syllable
                        
$idx = (((ord($utf_char[0]) & 0x0F) << 12) | ((ord($utf_char[1]) & 0x3F) << 6) | (ord($utf_char[2]) & 0x3F)) - UNICODE_HANGUL_SBASE;

                        
// LIndex can only range from 0 to 18, therefore it cannot influence the first two bytes of the L Jamo, which allows us to hardcode them (based on LBase).
                        //
                        // The same goes for VIndex, but for TIndex there's a catch: the value of the third byte could exceed 0xBF and we would have to increment the second byte
                        
if ($t_index $idx UNICODE_HANGUL_TCOUNT)
                        {
                            if (
$t_index 25)
                            {
                                
$utf_char "xE1x84x00xE1x85x00xE1x86x00";
                                
$utf_char[8] = chr(0xA7 $t_index);
                            }
                            else
                            {
                                
$utf_char "xE1x84x00xE1x85x00xE1x87x00";
                                
$utf_char[8] = chr(0x67 $t_index);
                            }
                        }
                        else
                        {
                            
$utf_char "xE1x84x00xE1x85x00";
                        }

                        
$utf_char[2] = chr(0x80 + (int) ($idx UNICODE_HANGUL_NCOUNT));
                        
$utf_char[5] = chr(0xA1 + (int) (($idx UNICODE_HANGUL_NCOUNT) / UNICODE_HANGUL_TCOUNT));

                        
// Just like other decompositions, the resulting Jamos must be dumped to the tmp string
                        
$dump 1;
                    }

                    
// Do we need to dump stuff to the tmp string?
                    
if ($dump)
                    {
                        
$tmp .= substr($str$tmp_pos$starter_pos $tmp_pos);

                        
// Dump combiners
                        
if (!empty($utf_sort))
                        {
                            if (
$sort)
                            {
                                
ksort($utf_sort);
                            }

                            foreach (
$utf_sort as $utf_chars)
                            {
                                
$tmp .= implode(''$utf_chars);
                            }
                        }

                        
$tmp .= $utf_char;
                        
$dump $sort 0;
                        
$tmp_pos $pos;
                    }

                    
$last_cc 0;
                    
$utf_sort = array();
                    
$starter_pos $pos;
                }
            }
            else
            {
                
// ASCII char, which happens to be a starter (as any other ASCII char)
                
if ($dump)
                {
                    
$tmp .= substr($str$tmp_pos$starter_pos $tmp_pos);

                    
// Dump combiners
                    
if (!empty($utf_sort))
                    {
                        if (
$sort)
                        {
                            
ksort($utf_sort);
                        }

                        foreach (
$utf_sort as $utf_chars)
                        {
                            
$tmp .= implode(''$utf_chars);
                        }
                    }

                    
$tmp .= $str[$pos];
                    
$dump $sort 0;
                    
$tmp_pos = ++$pos;

                    
$pos += strspn($strUTF8_ASCII_RANGE$pos);
                }
                else
                {
                    
$pos += strspn($strUTF8_ASCII_RANGE, ++$pos);
                }

                
$last_cc 0;
                
$utf_sort = array();
                
$starter_pos $pos;
            }
        }
        while (
$pos $len);

        
// Now is time to return the string
        
if ($dump)
        {
            
$tmp .= substr($str$tmp_pos$starter_pos $tmp_pos);

            
// Dump combiners
            
if (!empty($utf_sort))
            {
                if (
$sort)
                {
                    
ksort($utf_sort);
                }

                foreach (
$utf_sort as $utf_chars)
                {
                    
$tmp .= implode(''$utf_chars);
                }
            }

            return 
$tmp;
        }
        else if (
$tmp_pos)
        {
            
// If the $tmp_pos cursor was moved then at least one character was not in normal form. Replace $str with the fixed version
            
if ($tmp_pos == $len)
            {
                
// The $tmp_pos cursor is at the end of $str, therefore $tmp holds the whole $str
                
return $tmp;
            }
            else
            {
                
// The rightmost chunk of $str has not been appended to $tmp yet
                
return $tmp substr($str$tmp_pos);
            }
        }

        
// The string was already in normal form
        
return $str;
    }
}
Онлайн: 3
Реклама