Файл: public_html/core/UTF8.php
Строк: 2892
<?php
/**
* PHP5 UTF-8 is a UTF-8 aware library of functions mirroring PHP's own string functions.
*
* The powerful solution/contribution for UTF-8 support in your framework/CMS, written on PHP.
* This library is an advanced successor to http://sourceforge.net/projects/phputf8 (last updated in 2007).
*
* UTF-8 support in PHP 5.
*
* Features and benefits of using this class
* * Compatibility with the interface standard PHP functions that deal with single-byte encodings
* * Works without the PHP extensions ICONV and MBSTRING; if they are available, they are actively used!
* * Useful features that are missing from ICONV and MBSTRING
* * The methods that take and return a string, are able to take and return null (useful for selects from a database)
* * Several methods are able to process arrays recursively
* * A single interface and encapsulation (you can inherit and override)
* * High performance, reliability and quality code
* * PHP >= 5.3.x
*
* In Russian:
*
* Поддержка UTF-8 в PHP 5.
*
* Возможности и преимущества использования этого класса
* * Совместимость с интерфейсом стандартных PHP функций, работающих с однобайтовыми кодировками
* * Возможность работы без PHP расширений ICONV и MBSTRING, если они есть, то активно используются!
* * Полезные функции, отсутствующие в ICONV и MBSTRING
* * Методы, которые принимают и возвращают строку, умеют принимать и возвращать null (удобно при выборках значений из базы данных)
* * Несколько методов умеют обрабатывать массивы рекурсивно
* * Единый интерфейс и инкапсуляция (можно унаследоваться и переопределить методы)
* * Высокая производительность, надёжность и качественный код
* * PHP >= 5.3.x
*
* Example:
* $s = 'Hello, Привет';
* if (UTF8::is_utf8($s)) echo UTF8::strlen($s);
*
* UTF-8 encoding scheme:
* 2^7 0x00000000 — 0x0000007F 0xxxxxxx
* 2^11 0x00000080 — 0x000007FF 110xxxxx 10xxxxxx
* 2^16 0x00000800 — 0x0000FFFF 1110xxxx 10xxxxxx 10xxxxxx
* 2^21 0x00010000 — 0x001FFFFF 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
* 1-4 bytes length: 2^7 + 2^11 + 2^16 + 2^21 = 2 164 864
*
* If I were the owner of the world, I would leave only 2 encodings: UTF-8 and UTF-32 ;-)
*
* Useful links
* http://ru.wikipedia.org/wiki/UTF8
* http://www.madore.org/~david/misc/unitest/ A Unicode Test Page
* http://www.unicode.org/
* http://www.unicode.org/reports/
* http://www.unicode.org/reports/tr10/ Unicode Collation Algorithm
* http://www.unicode.org/Public/UCA/6.0.0/ Unicode Collation Algorithm
* http://www.unicode.org/reports/tr6/ A Standard Compression Scheme for Unicode
* http://www.fileformat.info/info/unicode/char/search.htm Unicode Character Search
*
* @link http://code.google.com/p/php5-utf8/
* @license http://creativecommons.org/licenses/by-sa/3.0/
* @author Nasibullin Rinat
* @version 2.2.2
*/
class UTF8
{
# REPLACEMENT CHARACTER (U+FFFD), used in place of a broken character.
# The value must be double-quoted with real \x escapes so that PHP expands it
# to the three raw bytes EF BF BD (the UTF-8 encoding of U+FFFD).
const REPLACEMENT_CHAR = "\xEF\xBF\xBD"; #U+FFFD
/**
* Regular expression for a single character in UTF-8, usable without the /u flag.
*
* The pattern contains layout whitespace and "#" comments, so it must be embedded
* in a pattern compiled with the /x (extended) modifier. The \xHH escapes are kept
* in a single-quoted PHP string on purpose: PCRE, not PHP, interprets them as bytes.
*
* @deprecated Instead, use a dot (".") and the flag /u, it works faster!
* @var string
*/
public static $char_re = ' [\x09\x0A\x0D\x20-\x7E] # ASCII strict
# [\x00-\x7F] # ASCII non-strict (including control chars)
| [\xC2-\xDF][\x80-\xBF] # non-overlong 2-byte
| \xE0[\xA0-\xBF][\x80-\xBF] # excluding overlongs
| [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2} # straight 3-byte
| \xED[\x80-\x9F][\x80-\xBF] # excluding surrogates
| \xF0[\x90-\xBF][\x80-\xBF]{2} # planes 1-3
| [\xF1-\xF3][\x80-\xBF]{3} # planes 4-15
| \xF4[\x80-\x8F][\x80-\xBF]{2} # plane 16
';
/**
* Combining diacritical marks (Unicode 5.1), as a byte-level regular expression.
*
* For example, russian letters in composed form: "Ё" (U+0401), "Й" (U+0419),
* decomposed form: (U+0415 U+0308), (U+0418 U+0306)
*
* The pattern contains layout whitespace and "#" comments, so it must be embedded
* in a pattern compiled with the /x (extended) modifier. The \xHH escapes are kept
* in a single-quoted PHP string on purpose: PCRE, not PHP, interprets them as bytes.
*
* U+0300 — U+036F is encoded in UTF-8 as CC 80..CC BF followed by CD 80..CD AF,
* so the first alternative has to cover the whole \x80-\xbf range of trail bytes.
*
* @link http://www.unicode.org/charts/PDF/U0300.pdf
* @link http://www.unicode.org/charts/PDF/U1DC0.pdf
* @link http://www.unicode.org/charts/PDF/UFE20.pdf
* @var string
*/
#public static $diactrical_re = 'p{M}'; #alternative, but only with /u flag
public static $diactrical_re = ' \xcc[\x80-\xbf]|\xcd[\x80-\xaf] #UNICODE range: U+0300 — U+036F (for letters)
| \xe2\x83[\x90-\xbf] #UNICODE range: U+20D0 — U+20FF (for symbols)
| \xe1\xb7[\x80-\xbf] #UNICODE range: U+1DC0 — U+1DFF (supplement)
| \xef\xb8[\xa0-\xaf] #UNICODE range: U+FE20 — U+FE2F (combining half marks)
';
/**
* The four HTML special characters: entity name => the character it stands for.
*
* Keys are the entity references as they appear in markup; values are the
* single-byte characters written as \x escapes.
*
* @var array
*/
public static $html_special_chars_table = array(
    '&quot;' => "\x22", #U+0022 ["] quotation mark = APL quote
    '&amp;'  => "\x26", #U+0026 [&] ampersand
    '&lt;'   => "\x3c", #U+003C [<] less-than sign
    '&gt;'   => "\x3e", #U+003E [>] greater-than sign
);
/**
* Named HTML entities (all except the four special characters quot/amp/lt/gt)
* mapped to the UTF-8 byte sequence of the character they stand for.
*
* Keys are entity references as they appear in markup (e.g. '&nbsp;');
* values are double-quoted so that PHP expands the \x escapes into raw bytes.
*
* @link http://www.fileformat.info/format/w3c/entitytest.htm?sort=Unicode%20Character HTML Entity Browser Test Page
* @var array
*/
public static $html_entity_table = array(
    #Latin-1 Entities:
    '&nbsp;'   => "\xc2\xa0", #U+00A0 no-break space = non-breaking space
    '&iexcl;'  => "\xc2\xa1", #U+00A1 [¡] inverted exclamation mark
    '&cent;'   => "\xc2\xa2", #U+00A2 [¢] cent sign
    '&pound;'  => "\xc2\xa3", #U+00A3 [£] pound sign
    '&curren;' => "\xc2\xa4", #U+00A4 [¤] currency sign
    '&yen;'    => "\xc2\xa5", #U+00A5 [¥] yen sign = yuan sign
    '&brvbar;' => "\xc2\xa6", #U+00A6 [¦] broken bar = broken vertical bar
    '&sect;'   => "\xc2\xa7", #U+00A7 [§] section sign
    '&uml;'    => "\xc2\xa8", #U+00A8 [¨] diaeresis = spacing diaeresis
    '&copy;'   => "\xc2\xa9", #U+00A9 [©] copyright sign
    '&ordf;'   => "\xc2\xaa", #U+00AA [ª] feminine ordinal indicator
    '&laquo;'  => "\xc2\xab", #U+00AB [«] left-pointing double angle quotation mark = left pointing guillemet
    '&not;'    => "\xc2\xac", #U+00AC [¬] not sign
    '&shy;'    => "\xc2\xad", #U+00AD soft hyphen = discretionary hyphen
    '&reg;'    => "\xc2\xae", #U+00AE [®] registered sign = registered trade mark sign
    '&macr;'   => "\xc2\xaf", #U+00AF [¯] macron = spacing macron = overline = APL overbar
    '&deg;'    => "\xc2\xb0", #U+00B0 [°] degree sign
    '&plusmn;' => "\xc2\xb1", #U+00B1 [±] plus-minus sign = plus-or-minus sign
    '&sup2;'   => "\xc2\xb2", #U+00B2 [²] superscript two = superscript digit two = squared
    '&sup3;'   => "\xc2\xb3", #U+00B3 [³] superscript three = superscript digit three = cubed
    '&acute;'  => "\xc2\xb4", #U+00B4 [´] acute accent = spacing acute
    '&micro;'  => "\xc2\xb5", #U+00B5 [µ] micro sign
    '&para;'   => "\xc2\xb6", #U+00B6 [¶] pilcrow sign = paragraph sign
    '&middot;' => "\xc2\xb7", #U+00B7 [·] middle dot = Georgian comma = Greek middle dot
    '&cedil;'  => "\xc2\xb8", #U+00B8 [¸] cedilla = spacing cedilla
    '&sup1;'   => "\xc2\xb9", #U+00B9 [¹] superscript one = superscript digit one
    '&ordm;'   => "\xc2\xba", #U+00BA [º] masculine ordinal indicator
    '&raquo;'  => "\xc2\xbb", #U+00BB [»] right-pointing double angle quotation mark = right pointing guillemet
    '&frac14;' => "\xc2\xbc", #U+00BC [¼] vulgar fraction one quarter = fraction one quarter
    '&frac12;' => "\xc2\xbd", #U+00BD [½] vulgar fraction one half = fraction one half
    '&frac34;' => "\xc2\xbe", #U+00BE [¾] vulgar fraction three quarters = fraction three quarters
    '&iquest;' => "\xc2\xbf", #U+00BF [¿] inverted question mark = turned question mark
    #Latin capital letter
    '&Agrave;' => "\xc3\x80", #Latin capital letter A with grave = Latin capital letter A grave
    '&Aacute;' => "\xc3\x81", #Latin capital letter A with acute
    '&Acirc;'  => "\xc3\x82", #Latin capital letter A with circumflex
    '&Atilde;' => "\xc3\x83", #Latin capital letter A with tilde
    '&Auml;'   => "\xc3\x84", #Latin capital letter A with diaeresis
    '&Aring;'  => "\xc3\x85", #Latin capital letter A with ring above = Latin capital letter A ring
    '&AElig;'  => "\xc3\x86", #Latin capital letter AE = Latin capital ligature AE
    '&Ccedil;' => "\xc3\x87", #Latin capital letter C with cedilla
    '&Egrave;' => "\xc3\x88", #Latin capital letter E with grave
    '&Eacute;' => "\xc3\x89", #Latin capital letter E with acute
    '&Ecirc;'  => "\xc3\x8a", #Latin capital letter E with circumflex
    '&Euml;'   => "\xc3\x8b", #Latin capital letter E with diaeresis
    '&Igrave;' => "\xc3\x8c", #Latin capital letter I with grave
    '&Iacute;' => "\xc3\x8d", #Latin capital letter I with acute
    '&Icirc;'  => "\xc3\x8e", #Latin capital letter I with circumflex
    '&Iuml;'   => "\xc3\x8f", #Latin capital letter I with diaeresis
    '&ETH;'    => "\xc3\x90", #Latin capital letter ETH
    '&Ntilde;' => "\xc3\x91", #Latin capital letter N with tilde
    '&Ograve;' => "\xc3\x92", #Latin capital letter O with grave
    '&Oacute;' => "\xc3\x93", #Latin capital letter O with acute
    '&Ocirc;'  => "\xc3\x94", #Latin capital letter O with circumflex
    '&Otilde;' => "\xc3\x95", #Latin capital letter O with tilde
    '&Ouml;'   => "\xc3\x96", #Latin capital letter O with diaeresis
    '&times;'  => "\xc3\x97", #U+00D7 [×] multiplication sign
    '&Oslash;' => "\xc3\x98", #Latin capital letter O with stroke = Latin capital letter O slash
    '&Ugrave;' => "\xc3\x99", #Latin capital letter U with grave
    '&Uacute;' => "\xc3\x9a", #Latin capital letter U with acute
    '&Ucirc;'  => "\xc3\x9b", #Latin capital letter U with circumflex
    '&Uuml;'   => "\xc3\x9c", #Latin capital letter U with diaeresis
    '&Yacute;' => "\xc3\x9d", #Latin capital letter Y with acute
    '&THORN;'  => "\xc3\x9e", #Latin capital letter THORN
    #Latin small letter
    '&szlig;'  => "\xc3\x9f", #Latin small letter sharp s = ess-zed
    '&agrave;' => "\xc3\xa0", #Latin small letter a with grave = Latin small letter a grave
    '&aacute;' => "\xc3\xa1", #Latin small letter a with acute
    '&acirc;'  => "\xc3\xa2", #Latin small letter a with circumflex
    '&atilde;' => "\xc3\xa3", #Latin small letter a with tilde
    '&auml;'   => "\xc3\xa4", #Latin small letter a with diaeresis
    '&aring;'  => "\xc3\xa5", #Latin small letter a with ring above = Latin small letter a ring
    '&aelig;'  => "\xc3\xa6", #Latin small letter ae = Latin small ligature ae
    '&ccedil;' => "\xc3\xa7", #Latin small letter c with cedilla
    '&egrave;' => "\xc3\xa8", #Latin small letter e with grave
    '&eacute;' => "\xc3\xa9", #Latin small letter e with acute
    '&ecirc;'  => "\xc3\xaa", #Latin small letter e with circumflex
    '&euml;'   => "\xc3\xab", #Latin small letter e with diaeresis
    '&igrave;' => "\xc3\xac", #Latin small letter i with grave
    '&iacute;' => "\xc3\xad", #Latin small letter i with acute
    '&icirc;'  => "\xc3\xae", #Latin small letter i with circumflex
    '&iuml;'   => "\xc3\xaf", #Latin small letter i with diaeresis
    '&eth;'    => "\xc3\xb0", #Latin small letter eth
    '&ntilde;' => "\xc3\xb1", #Latin small letter n with tilde
    '&ograve;' => "\xc3\xb2", #Latin small letter o with grave
    '&oacute;' => "\xc3\xb3", #Latin small letter o with acute
    '&ocirc;'  => "\xc3\xb4", #Latin small letter o with circumflex
    '&otilde;' => "\xc3\xb5", #Latin small letter o with tilde
    '&ouml;'   => "\xc3\xb6", #Latin small letter o with diaeresis
    '&divide;' => "\xc3\xb7", #U+00F7 [÷] division sign
    '&oslash;' => "\xc3\xb8", #Latin small letter o with stroke = Latin small letter o slash
    '&ugrave;' => "\xc3\xb9", #Latin small letter u with grave
    '&uacute;' => "\xc3\xba", #Latin small letter u with acute
    '&ucirc;'  => "\xc3\xbb", #Latin small letter u with circumflex
    '&uuml;'   => "\xc3\xbc", #Latin small letter u with diaeresis
    '&yacute;' => "\xc3\xbd", #Latin small letter y with acute
    '&thorn;'  => "\xc3\xbe", #Latin small letter thorn
    '&yuml;'   => "\xc3\xbf", #Latin small letter y with diaeresis
    #Symbols and Greek Letters:
    '&fnof;'     => "\xc6\x92", #U+0192 [ƒ] Latin small f with hook = function = florin
    '&Alpha;'    => "\xce\x91", #Greek capital letter alpha
    '&Beta;'     => "\xce\x92", #Greek capital letter beta
    '&Gamma;'    => "\xce\x93", #Greek capital letter gamma
    '&Delta;'    => "\xce\x94", #Greek capital letter delta
    '&Epsilon;'  => "\xce\x95", #Greek capital letter epsilon
    '&Zeta;'     => "\xce\x96", #Greek capital letter zeta
    '&Eta;'      => "\xce\x97", #Greek capital letter eta
    '&Theta;'    => "\xce\x98", #Greek capital letter theta
    '&Iota;'     => "\xce\x99", #Greek capital letter iota
    '&Kappa;'    => "\xce\x9a", #Greek capital letter kappa
    '&Lambda;'   => "\xce\x9b", #Greek capital letter lambda
    '&Mu;'       => "\xce\x9c", #Greek capital letter mu
    '&Nu;'       => "\xce\x9d", #Greek capital letter nu
    '&Xi;'       => "\xce\x9e", #Greek capital letter xi
    '&Omicron;'  => "\xce\x9f", #Greek capital letter omicron
    '&Pi;'       => "\xce\xa0", #Greek capital letter pi
    '&Rho;'      => "\xce\xa1", #Greek capital letter rho
    '&Sigma;'    => "\xce\xa3", #Greek capital letter sigma
    '&Tau;'      => "\xce\xa4", #Greek capital letter tau
    '&Upsilon;'  => "\xce\xa5", #Greek capital letter upsilon
    '&Phi;'      => "\xce\xa6", #Greek capital letter phi
    '&Chi;'      => "\xce\xa7", #Greek capital letter chi
    '&Psi;'      => "\xce\xa8", #Greek capital letter psi
    '&Omega;'    => "\xce\xa9", #Greek capital letter omega
    '&alpha;'    => "\xce\xb1", #Greek small letter alpha
    '&beta;'     => "\xce\xb2", #Greek small letter beta
    '&gamma;'    => "\xce\xb3", #Greek small letter gamma
    '&delta;'    => "\xce\xb4", #Greek small letter delta
    '&epsilon;'  => "\xce\xb5", #Greek small letter epsilon
    '&zeta;'     => "\xce\xb6", #Greek small letter zeta
    '&eta;'      => "\xce\xb7", #Greek small letter eta
    '&theta;'    => "\xce\xb8", #Greek small letter theta
    '&iota;'     => "\xce\xb9", #Greek small letter iota
    '&kappa;'    => "\xce\xba", #Greek small letter kappa
    '&lambda;'   => "\xce\xbb", #Greek small letter lambda
    '&mu;'       => "\xce\xbc", #Greek small letter mu
    '&nu;'       => "\xce\xbd", #Greek small letter nu
    '&xi;'       => "\xce\xbe", #Greek small letter xi
    '&omicron;'  => "\xce\xbf", #Greek small letter omicron
    '&pi;'       => "\xcf\x80", #Greek small letter pi
    '&rho;'      => "\xcf\x81", #Greek small letter rho
    '&sigmaf;'   => "\xcf\x82", #Greek small letter final sigma
    '&sigma;'    => "\xcf\x83", #Greek small letter sigma
    '&tau;'      => "\xcf\x84", #Greek small letter tau
    '&upsilon;'  => "\xcf\x85", #Greek small letter upsilon
    '&phi;'      => "\xcf\x86", #Greek small letter phi
    '&chi;'      => "\xcf\x87", #Greek small letter chi
    '&psi;'      => "\xcf\x88", #Greek small letter psi
    '&omega;'    => "\xcf\x89", #Greek small letter omega
    '&thetasym;' => "\xcf\x91", #Greek small letter theta symbol
    '&upsih;'    => "\xcf\x92", #Greek upsilon with hook symbol
    '&piv;'      => "\xcf\x96", #U+03D6 [ϖ] Greek pi symbol
    '&bull;'     => "\xe2\x80\xa2", #U+2022 [•] bullet = black small circle
    '&hellip;'   => "\xe2\x80\xa6", #U+2026 […] horizontal ellipsis = three dot leader
    '&prime;'    => "\xe2\x80\xb2", #U+2032 [′] prime = minutes = feet (used to denote minutes and feet)
    '&Prime;'    => "\xe2\x80\xb3", #U+2033 [″] double prime = seconds = inches (used to denote seconds and inches)
    '&oline;'    => "\xe2\x80\xbe", #U+203E [‾] overline = spacing overscore
    '&frasl;'    => "\xe2\x81\x84", #U+2044 [⁄] fraction slash
    '&weierp;'   => "\xe2\x84\x98", #U+2118 [℘] script capital P = power set = Weierstrass p
    '&image;'    => "\xe2\x84\x91", #U+2111 [ℑ] blackletter capital I = imaginary part
    '&real;'     => "\xe2\x84\x9c", #U+211C [ℜ] blackletter capital R = real part symbol
    '&trade;'    => "\xe2\x84\xa2", #U+2122 [™] trade mark sign
    '&alefsym;'  => "\xe2\x84\xb5", #U+2135 [ℵ] alef symbol = first transfinite cardinal
    '&larr;'     => "\xe2\x86\x90", #U+2190 [←] leftwards arrow
    '&uarr;'     => "\xe2\x86\x91", #U+2191 [↑] upwards arrow
    '&rarr;'     => "\xe2\x86\x92", #U+2192 [→] rightwards arrow
    '&darr;'     => "\xe2\x86\x93", #U+2193 [↓] downwards arrow
    '&harr;'     => "\xe2\x86\x94", #U+2194 [↔] left right arrow
    '&crarr;'    => "\xe2\x86\xb5", #U+21B5 [↵] downwards arrow with corner leftwards = carriage return
    '&lArr;'     => "\xe2\x87\x90", #U+21D0 [⇐] leftwards double arrow
    '&uArr;'     => "\xe2\x87\x91", #U+21D1 [⇑] upwards double arrow
    '&rArr;'     => "\xe2\x87\x92", #U+21D2 [⇒] rightwards double arrow
    '&dArr;'     => "\xe2\x87\x93", #U+21D3 [⇓] downwards double arrow
    '&hArr;'     => "\xe2\x87\x94", #U+21D4 [⇔] left right double arrow
    '&forall;'   => "\xe2\x88\x80", #U+2200 [∀] for all
    '&part;'     => "\xe2\x88\x82", #U+2202 [∂] partial differential
    '&exist;'    => "\xe2\x88\x83", #U+2203 [∃] there exists
    '&empty;'    => "\xe2\x88\x85", #U+2205 [∅] empty set = null set = diameter
    '&nabla;'    => "\xe2\x88\x87", #U+2207 [∇] nabla = backward difference
    '&isin;'     => "\xe2\x88\x88", #U+2208 [∈] element of
    '&notin;'    => "\xe2\x88\x89", #U+2209 [∉] not an element of
    '&ni;'       => "\xe2\x88\x8b", #U+220B [∋] contains as member
    '&prod;'     => "\xe2\x88\x8f", #U+220F [∏] n-ary product = product sign
    '&sum;'      => "\xe2\x88\x91", #U+2211 [∑] n-ary summation
    '&minus;'    => "\xe2\x88\x92", #U+2212 [−] minus sign
    '&lowast;'   => "\xe2\x88\x97", #U+2217 [∗] asterisk operator
    '&radic;'    => "\xe2\x88\x9a", #U+221A [√] square root = radical sign
    '&prop;'     => "\xe2\x88\x9d", #U+221D [∝] proportional to
    '&infin;'    => "\xe2\x88\x9e", #U+221E [∞] infinity
    '&ang;'      => "\xe2\x88\xa0", #U+2220 [∠] angle
    '&and;'      => "\xe2\x88\xa7", #U+2227 [∧] logical and = wedge
    '&or;'       => "\xe2\x88\xa8", #U+2228 [∨] logical or = vee
    '&cap;'      => "\xe2\x88\xa9", #U+2229 [∩] intersection = cap
    '&cup;'      => "\xe2\x88\xaa", #U+222A [∪] union = cup
    '&int;'      => "\xe2\x88\xab", #U+222B [∫] integral
    '&there4;'   => "\xe2\x88\xb4", #U+2234 [∴] therefore
    '&sim;'      => "\xe2\x88\xbc", #U+223C [∼] tilde operator = varies with = similar to
    '&cong;'     => "\xe2\x89\x85", #U+2245 [≅] approximately equal to
    '&asymp;'    => "\xe2\x89\x88", #U+2248 [≈] almost equal to = asymptotic to
    '&ne;'       => "\xe2\x89\xa0", #U+2260 [≠] not equal to
    '&equiv;'    => "\xe2\x89\xa1", #U+2261 [≡] identical to
    '&le;'       => "\xe2\x89\xa4", #U+2264 [≤] less-than or equal to
    '&ge;'       => "\xe2\x89\xa5", #U+2265 [≥] greater-than or equal to
    '&sub;'      => "\xe2\x8a\x82", #U+2282 [⊂] subset of
    '&sup;'      => "\xe2\x8a\x83", #U+2283 [⊃] superset of
    '&nsub;'     => "\xe2\x8a\x84", #U+2284 [⊄] not a subset of
    '&sube;'     => "\xe2\x8a\x86", #U+2286 [⊆] subset of or equal to
    '&supe;'     => "\xe2\x8a\x87", #U+2287 [⊇] superset of or equal to
    '&oplus;'    => "\xe2\x8a\x95", #U+2295 [⊕] circled plus = direct sum
    '&otimes;'   => "\xe2\x8a\x97", #U+2297 [⊗] circled times = vector product
    '&perp;'     => "\xe2\x8a\xa5", #U+22A5 [⊥] up tack = orthogonal to = perpendicular
    '&sdot;'     => "\xe2\x8b\x85", #U+22C5 [⋅] dot operator
    '&lceil;'    => "\xe2\x8c\x88", #U+2308 [⌈] left ceiling = APL upstile
    '&rceil;'    => "\xe2\x8c\x89", #U+2309 [⌉] right ceiling
    '&lfloor;'   => "\xe2\x8c\x8a", #U+230A [⌊] left floor = APL downstile
    '&rfloor;'   => "\xe2\x8c\x8b", #U+230B [⌋] right floor
    '&lang;'     => "\xe2\x8c\xa9", #U+2329 [〈] left-pointing angle bracket = bra
    '&rang;'     => "\xe2\x8c\xaa", #U+232A [〉] right-pointing angle bracket = ket
    '&loz;'      => "\xe2\x97\x8a", #U+25CA [◊] lozenge
    '&spades;'   => "\xe2\x99\xa0", #U+2660 [♠] black spade suit
    '&clubs;'    => "\xe2\x99\xa3", #U+2663 [♣] black club suit = shamrock
    '&hearts;'   => "\xe2\x99\xa5", #U+2665 [♥] black heart suit = valentine
    '&diams;'    => "\xe2\x99\xa6", #U+2666 [♦] black diamond suit
    #Other Special Characters:
    '&OElig;'  => "\xc5\x92", #U+0152 [Œ] Latin capital ligature OE
    '&oelig;'  => "\xc5\x93", #U+0153 [œ] Latin small ligature oe
    '&Scaron;' => "\xc5\xa0", #U+0160 [Š] Latin capital letter S with caron
    '&scaron;' => "\xc5\xa1", #U+0161 [š] Latin small letter s with caron
    '&Yuml;'   => "\xc5\xb8", #U+0178 [Ÿ] Latin capital letter Y with diaeresis
    '&circ;'   => "\xcb\x86", #U+02C6 [ˆ] modifier letter circumflex accent
    '&tilde;'  => "\xcb\x9c", #U+02DC [˜] small tilde
    '&ensp;'   => "\xe2\x80\x82", #U+2002 en space
    '&emsp;'   => "\xe2\x80\x83", #U+2003 em space
    '&thinsp;' => "\xe2\x80\x89", #U+2009 thin space
    '&zwnj;'   => "\xe2\x80\x8c", #U+200C zero width non-joiner
    '&zwj;'    => "\xe2\x80\x8d", #U+200D zero width joiner
    '&lrm;'    => "\xe2\x80\x8e", #U+200E left-to-right mark
    '&rlm;'    => "\xe2\x80\x8f", #U+200F right-to-left mark
    '&ndash;'  => "\xe2\x80\x93", #U+2013 [–] en dash
    '&mdash;'  => "\xe2\x80\x94", #U+2014 [—] em dash
    '&lsquo;'  => "\xe2\x80\x98", #U+2018 [‘] left single quotation mark
    '&rsquo;'  => "\xe2\x80\x99", #U+2019 [’] right single quotation mark (and apostrophe!)
    '&sbquo;'  => "\xe2\x80\x9a", #U+201A [‚] single low-9 quotation mark
    '&ldquo;'  => "\xe2\x80\x9c", #U+201C [“] left double quotation mark
    '&rdquo;'  => "\xe2\x80\x9d", #U+201D [”] right double quotation mark
    '&bdquo;'  => "\xe2\x80\x9e", #U+201E [„] double low-9 quotation mark
    '&dagger;' => "\xe2\x80\xa0", #U+2020 [†] dagger
    '&Dagger;' => "\xe2\x80\xa1", #U+2021 [‡] double dagger
    '&permil;' => "\xe2\x80\xb0", #U+2030 [‰] per mille sign
    '&lsaquo;' => "\xe2\x80\xb9", #U+2039 [‹] single left-pointing angle quotation mark
    '&rsaquo;' => "\xe2\x80\xba", #U+203A [›] single right-pointing angle quotation mark
    '&euro;'   => "\xe2\x82\xac", #U+20AC [€] euro sign
);
/**
* Maps cp1259 bytes (0x80-0xFF) to the UTF-8 byte sequence of the same character.
*
* cp1259 is an outdated single-byte encoding for the Tatar language. It is based on
* the cp1251 table and includes all the Russian letters from cp1251.
* Bytes 0x00-0x7F (ASCII) are identical in both encodings and therefore not listed;
* byte 0x98 is undefined in cp1259 and has no entry.
*
* Keys and values are double-quoted so that PHP expands the \x escapes into raw bytes.
*
* @link http://search.cpan.org/CPAN/authors/id/A/AM/AMICHAUER/Lingua-TT-Yanalif-0.08.tar.gz
* @link http://www.unicode.org/charts/PDF/U0400.pdf
*/
public static $cp1259_table = array(
    #bytes from 0x00 to 0x7F (ASCII) saved as is
    "\x80" => "\xd3\x98", #U+04d8 CYRILLIC CAPITAL LETTER SCHWA
    "\x81" => "\xd0\x83", #U+0403 CYRILLIC CAPITAL LETTER GJE
    "\x82" => "\xe2\x80\x9a", #U+201a SINGLE LOW-9 QUOTATION MARK
    "\x83" => "\xd1\x93", #U+0453 CYRILLIC SMALL LETTER GJE
    "\x84" => "\xe2\x80\x9e", #U+201e DOUBLE LOW-9 QUOTATION MARK
    "\x85" => "\xe2\x80\xa6", #U+2026 HORIZONTAL ELLIPSIS
    "\x86" => "\xe2\x80\xa0", #U+2020 DAGGER
    "\x87" => "\xe2\x80\xa1", #U+2021 DOUBLE DAGGER
    "\x88" => "\xe2\x82\xac", #U+20ac EURO SIGN
    "\x89" => "\xe2\x80\xb0", #U+2030 PER MILLE SIGN
    "\x8a" => "\xd3\xa8", #U+04e8 CYRILLIC CAPITAL LETTER BARRED O
    "\x8b" => "\xe2\x80\xb9", #U+2039 SINGLE LEFT-POINTING ANGLE QUOTATION MARK
    "\x8c" => "\xd2\xae", #U+04ae CYRILLIC CAPITAL LETTER STRAIGHT U
    "\x8d" => "\xd2\x96", #U+0496 CYRILLIC CAPITAL LETTER ZHE WITH DESCENDER
    "\x8e" => "\xd2\xa2", #U+04a2 CYRILLIC CAPITAL LETTER EN WITH HOOK
    "\x8f" => "\xd2\xba", #U+04ba CYRILLIC CAPITAL LETTER SHHA
    "\x90" => "\xd3\x99", #U+04d9 CYRILLIC SMALL LETTER SCHWA
    "\x91" => "\xe2\x80\x98", #U+2018 LEFT SINGLE QUOTATION MARK
    "\x92" => "\xe2\x80\x99", #U+2019 RIGHT SINGLE QUOTATION MARK
    "\x93" => "\xe2\x80\x9c", #U+201c LEFT DOUBLE QUOTATION MARK
    "\x94" => "\xe2\x80\x9d", #U+201d RIGHT DOUBLE QUOTATION MARK
    "\x95" => "\xe2\x80\xa2", #U+2022 BULLET
    "\x96" => "\xe2\x80\x93", #U+2013 EN DASH
    "\x97" => "\xe2\x80\x94", #U+2014 EM DASH
    #"\x98" #UNDEFINED
    "\x99" => "\xe2\x84\xa2", #U+2122 TRADE MARK SIGN
    "\x9a" => "\xd3\xa9", #U+04e9 CYRILLIC SMALL LETTER BARRED O
    "\x9b" => "\xe2\x80\xba", #U+203a SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
    "\x9c" => "\xd2\xaf", #U+04af CYRILLIC SMALL LETTER STRAIGHT U
    "\x9d" => "\xd2\x97", #U+0497 CYRILLIC SMALL LETTER ZHE WITH DESCENDER
    "\x9e" => "\xd2\xa3", #U+04a3 CYRILLIC SMALL LETTER EN WITH HOOK
    "\x9f" => "\xd2\xbb", #U+04bb CYRILLIC SMALL LETTER SHHA
    "\xa0" => "\xc2\xa0", #U+00a0 NO-BREAK SPACE
    "\xa1" => "\xd0\x8e", #U+040e CYRILLIC CAPITAL LETTER SHORT U
    "\xa2" => "\xd1\x9e", #U+045e CYRILLIC SMALL LETTER SHORT U
    "\xa3" => "\xd0\x88", #U+0408 CYRILLIC CAPITAL LETTER JE
    "\xa4" => "\xc2\xa4", #U+00a4 CURRENCY SIGN
    "\xa5" => "\xd2\x90", #U+0490 CYRILLIC CAPITAL LETTER GHE WITH UPTURN
    "\xa6" => "\xc2\xa6", #U+00a6 BROKEN BAR
    "\xa7" => "\xc2\xa7", #U+00a7 SECTION SIGN
    "\xa8" => "\xd0\x81", #U+0401 CYRILLIC CAPITAL LETTER IO
    "\xa9" => "\xc2\xa9", #U+00a9 COPYRIGHT SIGN
    "\xaa" => "\xd0\x84", #U+0404 CYRILLIC CAPITAL LETTER UKRAINIAN IE
    "\xab" => "\xc2\xab", #U+00ab LEFT-POINTING DOUBLE ANGLE QUOTATION MARK
    "\xac" => "\xc2\xac", #U+00ac NOT SIGN
    "\xad" => "\xc2\xad", #U+00ad SOFT HYPHEN
    "\xae" => "\xc2\xae", #U+00ae REGISTERED SIGN
    "\xaf" => "\xd0\x87", #U+0407 CYRILLIC CAPITAL LETTER YI
    "\xb0" => "\xc2\xb0", #U+00b0 DEGREE SIGN
    "\xb1" => "\xc2\xb1", #U+00b1 PLUS-MINUS SIGN
    "\xb2" => "\xd0\x86", #U+0406 CYRILLIC CAPITAL LETTER BYELORUSSIAN-UKRAINIAN I
    "\xb3" => "\xd1\x96", #U+0456 CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I
    "\xb4" => "\xd2\x91", #U+0491 CYRILLIC SMALL LETTER GHE WITH UPTURN
    "\xb5" => "\xc2\xb5", #U+00b5 MICRO SIGN
    "\xb6" => "\xc2\xb6", #U+00b6 PILCROW SIGN
    "\xb7" => "\xc2\xb7", #U+00b7 MIDDLE DOT
    "\xb8" => "\xd1\x91", #U+0451 CYRILLIC SMALL LETTER IO
    "\xb9" => "\xe2\x84\x96", #U+2116 NUMERO SIGN
    "\xba" => "\xd1\x94", #U+0454 CYRILLIC SMALL LETTER UKRAINIAN IE
    "\xbb" => "\xc2\xbb", #U+00bb RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK
    "\xbc" => "\xd1\x98", #U+0458 CYRILLIC SMALL LETTER JE
    "\xbd" => "\xd0\x85", #U+0405 CYRILLIC CAPITAL LETTER DZE
    "\xbe" => "\xd1\x95", #U+0455 CYRILLIC SMALL LETTER DZE
    "\xbf" => "\xd1\x97", #U+0457 CYRILLIC SMALL LETTER YI
    "\xc0" => "\xd0\x90", #U+0410 CYRILLIC CAPITAL LETTER A
    "\xc1" => "\xd0\x91", #U+0411 CYRILLIC CAPITAL LETTER BE
    "\xc2" => "\xd0\x92", #U+0412 CYRILLIC CAPITAL LETTER VE
    "\xc3" => "\xd0\x93", #U+0413 CYRILLIC CAPITAL LETTER GHE
    "\xc4" => "\xd0\x94", #U+0414 CYRILLIC CAPITAL LETTER DE
    "\xc5" => "\xd0\x95", #U+0415 CYRILLIC CAPITAL LETTER IE
    "\xc6" => "\xd0\x96", #U+0416 CYRILLIC CAPITAL LETTER ZHE
    "\xc7" => "\xd0\x97", #U+0417 CYRILLIC CAPITAL LETTER ZE
    "\xc8" => "\xd0\x98", #U+0418 CYRILLIC CAPITAL LETTER I
    "\xc9" => "\xd0\x99", #U+0419 CYRILLIC CAPITAL LETTER SHORT I
    "\xca" => "\xd0\x9a", #U+041a CYRILLIC CAPITAL LETTER KA
    "\xcb" => "\xd0\x9b", #U+041b CYRILLIC CAPITAL LETTER EL
    "\xcc" => "\xd0\x9c", #U+041c CYRILLIC CAPITAL LETTER EM
    "\xcd" => "\xd0\x9d", #U+041d CYRILLIC CAPITAL LETTER EN
    "\xce" => "\xd0\x9e", #U+041e CYRILLIC CAPITAL LETTER O
    "\xcf" => "\xd0\x9f", #U+041f CYRILLIC CAPITAL LETTER PE
    "\xd0" => "\xd0\xa0", #U+0420 CYRILLIC CAPITAL LETTER ER
    "\xd1" => "\xd0\xa1", #U+0421 CYRILLIC CAPITAL LETTER ES
    "\xd2" => "\xd0\xa2", #U+0422 CYRILLIC CAPITAL LETTER TE
    "\xd3" => "\xd0\xa3", #U+0423 CYRILLIC CAPITAL LETTER U
    "\xd4" => "\xd0\xa4", #U+0424 CYRILLIC CAPITAL LETTER EF
    "\xd5" => "\xd0\xa5", #U+0425 CYRILLIC CAPITAL LETTER HA
    "\xd6" => "\xd0\xa6", #U+0426 CYRILLIC CAPITAL LETTER TSE
    "\xd7" => "\xd0\xa7", #U+0427 CYRILLIC CAPITAL LETTER CHE
    "\xd8" => "\xd0\xa8", #U+0428 CYRILLIC CAPITAL LETTER SHA
    "\xd9" => "\xd0\xa9", #U+0429 CYRILLIC CAPITAL LETTER SHCHA
    "\xda" => "\xd0\xaa", #U+042a CYRILLIC CAPITAL LETTER HARD SIGN
    "\xdb" => "\xd0\xab", #U+042b CYRILLIC CAPITAL LETTER YERU
    "\xdc" => "\xd0\xac", #U+042c CYRILLIC CAPITAL LETTER SOFT SIGN
    "\xdd" => "\xd0\xad", #U+042d CYRILLIC CAPITAL LETTER E
    "\xde" => "\xd0\xae", #U+042e CYRILLIC CAPITAL LETTER YU
    "\xdf" => "\xd0\xaf", #U+042f CYRILLIC CAPITAL LETTER YA
    "\xe0" => "\xd0\xb0", #U+0430 CYRILLIC SMALL LETTER A
    "\xe1" => "\xd0\xb1", #U+0431 CYRILLIC SMALL LETTER BE
    "\xe2" => "\xd0\xb2", #U+0432 CYRILLIC SMALL LETTER VE
    "\xe3" => "\xd0\xb3", #U+0433 CYRILLIC SMALL LETTER GHE
    "\xe4" => "\xd0\xb4", #U+0434 CYRILLIC SMALL LETTER DE
    "\xe5" => "\xd0\xb5", #U+0435 CYRILLIC SMALL LETTER IE
    "\xe6" => "\xd0\xb6", #U+0436 CYRILLIC SMALL LETTER ZHE
    "\xe7" => "\xd0\xb7", #U+0437 CYRILLIC SMALL LETTER ZE
    "\xe8" => "\xd0\xb8", #U+0438 CYRILLIC SMALL LETTER I
    "\xe9" => "\xd0\xb9", #U+0439 CYRILLIC SMALL LETTER SHORT I
    "\xea" => "\xd0\xba", #U+043a CYRILLIC SMALL LETTER KA
    "\xeb" => "\xd0\xbb", #U+043b CYRILLIC SMALL LETTER EL
    "\xec" => "\xd0\xbc", #U+043c CYRILLIC SMALL LETTER EM
    "\xed" => "\xd0\xbd", #U+043d CYRILLIC SMALL LETTER EN
    "\xee" => "\xd0\xbe", #U+043e CYRILLIC SMALL LETTER O
    "\xef" => "\xd0\xbf", #U+043f CYRILLIC SMALL LETTER PE
    "\xf0" => "\xd1\x80", #U+0440 CYRILLIC SMALL LETTER ER
    "\xf1" => "\xd1\x81", #U+0441 CYRILLIC SMALL LETTER ES
    "\xf2" => "\xd1\x82", #U+0442 CYRILLIC SMALL LETTER TE
    "\xf3" => "\xd1\x83", #U+0443 CYRILLIC SMALL LETTER U
    "\xf4" => "\xd1\x84", #U+0444 CYRILLIC SMALL LETTER EF
    "\xf5" => "\xd1\x85", #U+0445 CYRILLIC SMALL LETTER HA
    "\xf6" => "\xd1\x86", #U+0446 CYRILLIC SMALL LETTER TSE
    "\xf7" => "\xd1\x87", #U+0447 CYRILLIC SMALL LETTER CHE
    "\xf8" => "\xd1\x88", #U+0448 CYRILLIC SMALL LETTER SHA
    "\xf9" => "\xd1\x89", #U+0449 CYRILLIC SMALL LETTER SHCHA
    "\xfa" => "\xd1\x8a", #U+044a CYRILLIC SMALL LETTER HARD SIGN
    "\xfb" => "\xd1\x8b", #U+044b CYRILLIC SMALL LETTER YERU
    "\xfc" => "\xd1\x8c", #U+044c CYRILLIC SMALL LETTER SOFT SIGN
    "\xfd" => "\xd1\x8d", #U+044d CYRILLIC SMALL LETTER E
    "\xfe" => "\xd1\x8e", #U+044e CYRILLIC SMALL LETTER YU
    "\xff" => "\xd1\x8f", #U+044f CYRILLIC SMALL LETTER YA
);
/**
* UTF-8 Case lookup table
*
* This lookup table maps upper-case letters to their corresponding
* lower-case letters in UTF-8
*
* @author Andreas Gohr <andi@splitbrain.org>
*/
public static $convert_case_table = array(
#CASE_UPPER => case_lower
"x41" => "x61", #A a
"x42" => "x62", #B b
"x43" => "x63", #C c
"x44" => "x64", #D d
"x45" => "x65", #E e
"x46" => "x66", #F f
"x47" => "x67", #G g
"x48" => "x68", #H h
"x49" => "x69", #I i
"x4a" => "x6a", #J j
"x4b" => "x6b", #K k
"x4c" => "x6c", #L l
"x4d" => "x6d", #M m
"x4e" => "x6e", #N n
"x4f" => "x6f", #O o
"x50" => "x70", #P p
"x51" => "x71", #Q q
"x52" => "x72", #R r
"x53" => "x73", #S s
"x54" => "x74", #T t
"x55" => "x75", #U u
"x56" => "x76", #V v
"x57" => "x77", #W w
"x58" => "x78", #X x
"x59" => "x79", #Y y
"x5a" => "x7a", #Z z
"xc3x80" => "xc3xa0",
"xc3x81" => "xc3xa1",
"xc3x82" => "xc3xa2",
"xc3x83" => "xc3xa3",
"xc3x84" => "xc3xa4",
"xc3x85" => "xc3xa5",
"xc3x86" => "xc3xa6",
"xc3x87" => "xc3xa7",
"xc3x88" => "xc3xa8",
"xc3x89" => "xc3xa9",
"xc3x8a" => "xc3xaa",
"xc3x8b" => "xc3xab",
"xc3x8c" => "xc3xac",
"xc3x8d" => "xc3xad",
"xc3x8e" => "xc3xae",
"xc3x8f" => "xc3xaf",
"xc3x90" => "xc3xb0",
"xc3x91" => "xc3xb1",
"xc3x92" => "xc3xb2",
"xc3x93" => "xc3xb3",
"xc3x94" => "xc3xb4",
"xc3x95" => "xc3xb5",
"xc3x96" => "xc3xb6",
"xc3x98" => "xc3xb8",
"xc3x99" => "xc3xb9",
"xc3x9a" => "xc3xba",
"xc3x9b" => "xc3xbb",
"xc3x9c" => "xc3xbc",
"xc3x9d" => "xc3xbd",
"xc3x9e" => "xc3xbe",
"xc4x80" => "xc4x81",
"xc4x82" => "xc4x83",
"xc4x84" => "xc4x85",
"xc4x86" => "xc4x87",
"xc4x88" => "xc4x89",
"xc4x8a" => "xc4x8b",
"xc4x8c" => "xc4x8d",
"xc4x8e" => "xc4x8f",
"xc4x90" => "xc4x91",
"xc4x92" => "xc4x93",
"xc4x94" => "xc4x95",
"xc4x96" => "xc4x97",
"xc4x98" => "xc4x99",
"xc4x9a" => "xc4x9b",
"xc4x9c" => "xc4x9d",
"xc4x9e" => "xc4x9f",
"xc4xa0" => "xc4xa1",
"xc4xa2" => "xc4xa3",
"xc4xa4" => "xc4xa5",
"xc4xa6" => "xc4xa7",
"xc4xa8" => "xc4xa9",
"xc4xaa" => "xc4xab",
"xc4xac" => "xc4xad",
"xc4xae" => "xc4xaf",
"xc4xb2" => "xc4xb3",
"xc4xb4" => "xc4xb5",
"xc4xb6" => "xc4xb7",
"xc4xb9" => "xc4xba",
"xc4xbb" => "xc4xbc",
"xc4xbd" => "xc4xbe",
"xc4xbf" => "xc5x80",
"xc5x81" => "xc5x82",
"xc5x83" => "xc5x84",
"xc5x85" => "xc5x86",
"xc5x87" => "xc5x88",
"xc5x8a" => "xc5x8b",
"xc5x8c" => "xc5x8d",
"xc5x8e" => "xc5x8f",
"xc5x90" => "xc5x91",
"xc5x92" => "xc5x93",
"xc5x94" => "xc5x95",
"xc5x96" => "xc5x97",
"xc5x98" => "xc5x99",
"xc5x9a" => "xc5x9b",
"xc5x9c" => "xc5x9d",
"xc5x9e" => "xc5x9f",
"xc5xa0" => "xc5xa1",
"xc5xa2" => "xc5xa3",
"xc5xa4" => "xc5xa5",
"xc5xa6" => "xc5xa7",
"xc5xa8" => "xc5xa9",
"xc5xaa" => "xc5xab",
"xc5xac" => "xc5xad",
"xc5xae" => "xc5xaf",
"xc5xb0" => "xc5xb1",
"xc5xb2" => "xc5xb3",
"xc5xb4" => "xc5xb5",
"xc5xb6" => "xc5xb7",
"xc5xb8" => "xc3xbf",
"xc5xb9" => "xc5xba",
"xc5xbb" => "xc5xbc",
"xc5xbd" => "xc5xbe",
"xc6x81" => "xc9x93",
"xc6x82" => "xc6x83",
"xc6x84" => "xc6x85",
"xc6x86" => "xc9x94",
"xc6x87" => "xc6x88",
"xc6x89" => "xc9x96",
"xc6x8a" => "xc9x97",
"xc6x8b" => "xc6x8c",
"xc6x8e" => "xc7x9d",
"xc6x8f" => "xc9x99",
"xc6x90" => "xc9x9b",
"xc6x91" => "xc6x92",
"xc6x94" => "xc9xa3",
"xc6x96" => "xc9xa9",
"xc6x97" => "xc9xa8",
"xc6x98" => "xc6x99",
"xc6x9c" => "xc9xaf",
"xc6x9d" => "xc9xb2",
"xc6x9f" => "xc9xb5",
"xc6xa0" => "xc6xa1",
"xc6xa2" => "xc6xa3",
"xc6xa4" => "xc6xa5",
"xc6xa6" => "xcax80",
"xc6xa7" => "xc6xa8",
"xc6xa9" => "xcax83",
"xc6xac" => "xc6xad",
"xc6xae" => "xcax88",
"xc6xaf" => "xc6xb0",
"xc6xb1" => "xcax8a",
"xc6xb2" => "xcax8b",
"xc6xb3" => "xc6xb4",
"xc6xb5" => "xc6xb6",
"xc6xb7" => "xcax92",
"xc6xb8" => "xc6xb9",
"xc6xbc" => "xc6xbd",
"xc7x85" => "xc7x86",
"xc7x88" => "xc7x89",
"xc7x8b" => "xc7x8c",
"xc7x8d" => "xc7x8e",
"xc7x8f" => "xc7x90",
"xc7x91" => "xc7x92",
"xc7x93" => "xc7x94",
"xc7x95" => "xc7x96",
"xc7x97" => "xc7x98",
"xc7x99" => "xc7x9a",
"xc7x9b" => "xc7x9c",
"xc7x9e" => "xc7x9f",
"xc7xa0" => "xc7xa1",
"xc7xa2" => "xc7xa3",
"xc7xa4" => "xc7xa5",
"xc7xa6" => "xc7xa7",
"xc7xa8" => "xc7xa9",
"xc7xaa" => "xc7xab",
"xc7xac" => "xc7xad",
"xc7xae" => "xc7xaf",
"xc7xb2" => "xc7xb3",
"xc7xb4" => "xc7xb5",
"xc7xb6" => "xc6x95",
"xc7xb7" => "xc6xbf",
"xc7xb8" => "xc7xb9",
"xc7xba" => "xc7xbb",
"xc7xbc" => "xc7xbd",
"xc7xbe" => "xc7xbf",
"xc8x80" => "xc8x81",
"xc8x82" => "xc8x83",
"xc8x84" => "xc8x85",
"xc8x86" => "xc8x87",
"xc8x88" => "xc8x89",
"xc8x8a" => "xc8x8b",
"xc8x8c" => "xc8x8d",
"xc8x8e" => "xc8x8f",
"xc8x90" => "xc8x91",
"xc8x92" => "xc8x93",
"xc8x94" => "xc8x95",
"xc8x96" => "xc8x97",
"xc8x98" => "xc8x99",
"xc8x9a" => "xc8x9b",
"xc8x9c" => "xc8x9d",
"xc8x9e" => "xc8x9f",
"xc8xa0" => "xc6x9e",
"xc8xa2" => "xc8xa3",
"xc8xa4" => "xc8xa5",
"xc8xa6" => "xc8xa7",
"xc8xa8" => "xc8xa9",
"xc8xaa" => "xc8xab",
"xc8xac" => "xc8xad",
"xc8xae" => "xc8xaf",
"xc8xb0" => "xc8xb1",
"xc8xb2" => "xc8xb3",
"xcex86" => "xcexac",
"xcex88" => "xcexad",
"xcex89" => "xcexae",
"xcex8a" => "xcexaf",
"xcex8c" => "xcfx8c",
"xcex8e" => "xcfx8d",
"xcex8f" => "xcfx8e",
"xcex91" => "xcexb1",
"xcex92" => "xcexb2",
"xcex93" => "xcexb3",
"xcex94" => "xcexb4",
"xcex95" => "xcexb5",
"xcex96" => "xcexb6",
"xcex97" => "xcexb7",
"xcex98" => "xcexb8",
"xcex99" => "xcexb9",
"xcex9a" => "xcexba",
"xcex9b" => "xcexbb",
"xcex9c" => "xc2xb5",
"xcex9d" => "xcexbd",
"xcex9e" => "xcexbe",
"xcex9f" => "xcexbf",
"xcexa0" => "xcfx80",
"xcexa1" => "xcfx81",
"xcexa3" => "xcfx82",
"xcexa4" => "xcfx84",
"xcexa5" => "xcfx85",
"xcexa6" => "xcfx86",
"xcexa7" => "xcfx87",
"xcexa8" => "xcfx88",
"xcexa9" => "xcfx89",
"xcexaa" => "xcfx8a",
"xcexab" => "xcfx8b",
"xcfx98" => "xcfx99",
"xcfx9a" => "xcfx9b",
"xcfx9c" => "xcfx9d",
"xcfx9e" => "xcfx9f",
"xcfxa0" => "xcfxa1",
"xcfxa2" => "xcfxa3",
"xcfxa4" => "xcfxa5",
"xcfxa6" => "xcfxa7",
"xcfxa8" => "xcfxa9",
"xcfxaa" => "xcfxab",
"xcfxac" => "xcfxad",
"xcfxae" => "xcfxaf",
"xd0x80" => "xd1x90",
"xd0x81" => "xd1x91",
"xd0x82" => "xd1x92",
"xd0x83" => "xd1x93",
"xd0x84" => "xd1x94",
"xd0x85" => "xd1x95",
"xd0x86" => "xd1x96",
"xd0x87" => "xd1x97",
"xd0x88" => "xd1x98",
"xd0x89" => "xd1x99",
"xd0x8a" => "xd1x9a",
"xd0x8b" => "xd1x9b",
"xd0x8c" => "xd1x9c",
"xd0x8d" => "xd1x9d",
"xd0x8e" => "xd1x9e",
"xd0x8f" => "xd1x9f",
"xd0x90" => "xd0xb0",
"xd0x91" => "xd0xb1",
"xd0x92" => "xd0xb2",
"xd0x93" => "xd0xb3",
"xd0x94" => "xd0xb4",
"xd0x95" => "xd0xb5",
"xd0x96" => "xd0xb6",
"xd0x97" => "xd0xb7",
"xd0x98" => "xd0xb8",
"xd0x99" => "xd0xb9",
"xd0x9a" => "xd0xba",
"xd0x9b" => "xd0xbb",
"xd0x9c" => "xd0xbc",
"xd0x9d" => "xd0xbd",
"xd0x9e" => "xd0xbe",
"xd0x9f" => "xd0xbf",
"xd0xa0" => "xd1x80",
"xd0xa1" => "xd1x81",
"xd0xa2" => "xd1x82",
"xd0xa3" => "xd1x83",
"xd0xa4" => "xd1x84",
"xd0xa5" => "xd1x85",
"xd0xa6" => "xd1x86",
"xd0xa7" => "xd1x87",
"xd0xa8" => "xd1x88",
"xd0xa9" => "xd1x89",
"xd0xaa" => "xd1x8a",
"xd0xab" => "xd1x8b",
"xd0xac" => "xd1x8c",
"xd0xad" => "xd1x8d",
"xd0xae" => "xd1x8e",
"xd0xaf" => "xd1x8f",
"xd1xa0" => "xd1xa1",
"xd1xa2" => "xd1xa3",
"xd1xa4" => "xd1xa5",
"xd1xa6" => "xd1xa7",
"xd1xa8" => "xd1xa9",
"xd1xaa" => "xd1xab",
"xd1xac" => "xd1xad",
"xd1xae" => "xd1xaf",
"xd1xb0" => "xd1xb1",
"xd1xb2" => "xd1xb3",
"xd1xb4" => "xd1xb5",
"xd1xb6" => "xd1xb7",
"xd1xb8" => "xd1xb9",
"xd1xba" => "xd1xbb",
"xd1xbc" => "xd1xbd",
"xd1xbe" => "xd1xbf",
"xd2x80" => "xd2x81",
"xd2x8a" => "xd2x8b",
"xd2x8c" => "xd2x8d",
"xd2x8e" => "xd2x8f",
"xd2x90" => "xd2x91",
"xd2x92" => "xd2x93",
"xd2x94" => "xd2x95",
"xd2x96" => "xd2x97",
"xd2x98" => "xd2x99",
"xd2x9a" => "xd2x9b",
"xd2x9c" => "xd2x9d",
"xd2x9e" => "xd2x9f",
"xd2xa0" => "xd2xa1",
"xd2xa2" => "xd2xa3",
"xd2xa4" => "xd2xa5",
"xd2xa6" => "xd2xa7",
"xd2xa8" => "xd2xa9",
"xd2xaa" => "xd2xab",
"xd2xac" => "xd2xad",
"xd2xae" => "xd2xaf",
"xd2xb0" => "xd2xb1",
"xd2xb2" => "xd2xb3",
"xd2xb4" => "xd2xb5",
"xd2xb6" => "xd2xb7",
"xd2xb8" => "xd2xb9",
"xd2xba" => "xd2xbb",
"xd2xbc" => "xd2xbd",
"xd2xbe" => "xd2xbf",
"xd3x81" => "xd3x82",
"xd3x83" => "xd3x84",
"xd3x85" => "xd3x86",
"xd3x87" => "xd3x88",
"xd3x89" => "xd3x8a",
"xd3x8b" => "xd3x8c",
"xd3x8d" => "xd3x8e",
"xd3x90" => "xd3x91",
"xd3x92" => "xd3x93",
"xd3x94" => "xd3x95",
"xd3x96" => "xd3x97",
"xd3x98" => "xd3x99",
"xd3x9a" => "xd3x9b",
"xd3x9c" => "xd3x9d",
"xd3x9e" => "xd3x9f",
"xd3xa0" => "xd3xa1",
"xd3xa2" => "xd3xa3",
"xd3xa4" => "xd3xa5",
"xd3xa6" => "xd3xa7",
"xd3xa8" => "xd3xa9",
"xd3xaa" => "xd3xab",
"xd3xac" => "xd3xad",
"xd3xae" => "xd3xaf",
"xd3xb0" => "xd3xb1",
"xd3xb2" => "xd3xb3",
"xd3xb4" => "xd3xb5",
"xd3xb8" => "xd3xb9",
"xd4x80" => "xd4x81",
"xd4x82" => "xd4x83",
"xd4x84" => "xd4x85",
"xd4x86" => "xd4x87",
"xd4x88" => "xd4x89",
"xd4x8a" => "xd4x8b",
"xd4x8c" => "xd4x8d",
"xd4x8e" => "xd4x8f",
"xd4xb1" => "xd5xa1",
"xd4xb2" => "xd5xa2",
"xd4xb3" => "xd5xa3",
"xd4xb4" => "xd5xa4",
"xd4xb5" => "xd5xa5",
"xd4xb6" => "xd5xa6",
"xd4xb7" => "xd5xa7",
"xd4xb8" => "xd5xa8",
"xd4xb9" => "xd5xa9",
"xd4xba" => "xd5xaa",
"xd4xbb" => "xd5xab",
"xd4xbc" => "xd5xac",
"xd4xbd" => "xd5xad",
"xd4xbe" => "xd5xae",
"xd4xbf" => "xd5xaf",
"xd5x80" => "xd5xb0",
"xd5x81" => "xd5xb1",
"xd5x82" => "xd5xb2",
"xd5x83" => "xd5xb3",
"xd5x84" => "xd5xb4",
"xd5x85" => "xd5xb5",
"xd5x86" => "xd5xb6",
"xd5x87" => "xd5xb7",
"xd5x88" => "xd5xb8",
"xd5x89" => "xd5xb9",
"xd5x8a" => "xd5xba",
"xd5x8b" => "xd5xbb",
"xd5x8c" => "xd5xbc",
"xd5x8d" => "xd5xbd",
"xd5x8e" => "xd5xbe",
"xd5x8f" => "xd5xbf",
"xd5x90" => "xd6x80",
"xd5x91" => "xd6x81",
"xd5x92" => "xd6x82",
"xd5x93" => "xd6x83",
"xd5x94" => "xd6x84",
"xd5x95" => "xd6x85",
"xd5x96" => "xd6x86",
"xe1xb8x80" => "xe1xb8x81",
"xe1xb8x82" => "xe1xb8x83",
"xe1xb8x84" => "xe1xb8x85",
"xe1xb8x86" => "xe1xb8x87",
"xe1xb8x88" => "xe1xb8x89",
"xe1xb8x8a" => "xe1xb8x8b",
"xe1xb8x8c" => "xe1xb8x8d",
"xe1xb8x8e" => "xe1xb8x8f",
"xe1xb8x90" => "xe1xb8x91",
"xe1xb8x92" => "xe1xb8x93",
"xe1xb8x94" => "xe1xb8x95",
"xe1xb8x96" => "xe1xb8x97",
"xe1xb8x98" => "xe1xb8x99",
"xe1xb8x9a" => "xe1xb8x9b",
"xe1xb8x9c" => "xe1xb8x9d",
"xe1xb8x9e" => "xe1xb8x9f",
"xe1xb8xa0" => "xe1xb8xa1",
"xe1xb8xa2" => "xe1xb8xa3",
"xe1xb8xa4" => "xe1xb8xa5",
"xe1xb8xa6" => "xe1xb8xa7",
"xe1xb8xa8" => "xe1xb8xa9",
"xe1xb8xaa" => "xe1xb8xab",
"xe1xb8xac" => "xe1xb8xad",
"xe1xb8xae" => "xe1xb8xaf",
"xe1xb8xb0" => "xe1xb8xb1",
"xe1xb8xb2" => "xe1xb8xb3",
"xe1xb8xb4" => "xe1xb8xb5",
"xe1xb8xb6" => "xe1xb8xb7",
"xe1xb8xb8" => "xe1xb8xb9",
"xe1xb8xba" => "xe1xb8xbb",
"xe1xb8xbc" => "xe1xb8xbd",
"xe1xb8xbe" => "xe1xb8xbf",
"xe1xb9x80" => "xe1xb9x81",
"xe1xb9x82" => "xe1xb9x83",
"xe1xb9x84" => "xe1xb9x85",
"xe1xb9x86" => "xe1xb9x87",
"xe1xb9x88" => "xe1xb9x89",
"xe1xb9x8a" => "xe1xb9x8b",
"xe1xb9x8c" => "xe1xb9x8d",
"xe1xb9x8e" => "xe1xb9x8f",
"xe1xb9x90" => "xe1xb9x91",
"xe1xb9x92" => "xe1xb9x93",
"xe1xb9x94" => "xe1xb9x95",
"xe1xb9x96" => "xe1xb9x97",
"xe1xb9x98" => "xe1xb9x99",
"xe1xb9x9a" => "xe1xb9x9b",
"xe1xb9x9c" => "xe1xb9x9d",
"xe1xb9x9e" => "xe1xb9x9f",
"xe1xb9xa0" => "xe1xb9xa1",
"xe1xb9xa2" => "xe1xb9xa3",
"xe1xb9xa4" => "xe1xb9xa5",
"xe1xb9xa6" => "xe1xb9xa7",
"xe1xb9xa8" => "xe1xb9xa9",
"xe1xb9xaa" => "xe1xb9xab",
"xe1xb9xac" => "xe1xb9xad",
"xe1xb9xae" => "xe1xb9xaf",
"xe1xb9xb0" => "xe1xb9xb1",
"xe1xb9xb2" => "xe1xb9xb3",
"xe1xb9xb4" => "xe1xb9xb5",
"xe1xb9xb6" => "xe1xb9xb7",
"xe1xb9xb8" => "xe1xb9xb9",
"xe1xb9xba" => "xe1xb9xbb",
"xe1xb9xbc" => "xe1xb9xbd",
"xe1xb9xbe" => "xe1xb9xbf",
"xe1xbax80" => "xe1xbax81",
"xe1xbax82" => "xe1xbax83",
"xe1xbax84" => "xe1xbax85",
"xe1xbax86" => "xe1xbax87",
"xe1xbax88" => "xe1xbax89",
"xe1xbax8a" => "xe1xbax8b",
"xe1xbax8c" => "xe1xbax8d",
"xe1xbax8e" => "xe1xbax8f",
"xe1xbax90" => "xe1xbax91",
"xe1xbax92" => "xe1xbax93",
"xe1xbax94" => "xe1xbax95",
"xe1xbaxa0" => "xe1xbaxa1",
"xe1xbaxa2" => "xe1xbaxa3",
"xe1xbaxa4" => "xe1xbaxa5",
"xe1xbaxa6" => "xe1xbaxa7",
"xe1xbaxa8" => "xe1xbaxa9",
"xe1xbaxaa" => "xe1xbaxab",
"xe1xbaxac" => "xe1xbaxad",
"xe1xbaxae" => "xe1xbaxaf",
"xe1xbaxb0" => "xe1xbaxb1",
"xe1xbaxb2" => "xe1xbaxb3",
"xe1xbaxb4" => "xe1xbaxb5",
"xe1xbaxb6" => "xe1xbaxb7",
"xe1xbaxb8" => "xe1xbaxb9",
"xe1xbaxba" => "xe1xbaxbb",
"xe1xbaxbc" => "xe1xbaxbd",
"xe1xbaxbe" => "xe1xbaxbf",
"xe1xbbx80" => "xe1xbbx81",
"xe1xbbx82" => "xe1xbbx83",
"xe1xbbx84" => "xe1xbbx85",
"xe1xbbx86" => "xe1xbbx87",
"xe1xbbx88" => "xe1xbbx89",
"xe1xbbx8a" => "xe1xbbx8b",
"xe1xbbx8c" => "xe1xbbx8d",
"xe1xbbx8e" => "xe1xbbx8f",
"xe1xbbx90" => "xe1xbbx91",
"xe1xbbx92" => "xe1xbbx93",
"xe1xbbx94" => "xe1xbbx95",
"xe1xbbx96" => "xe1xbbx97",
"xe1xbbx98" => "xe1xbbx99",
"xe1xbbx9a" => "xe1xbbx9b",
"xe1xbbx9c" => "xe1xbbx9d",
"xe1xbbx9e" => "xe1xbbx9f",
"xe1xbbxa0" => "xe1xbbxa1",
"xe1xbbxa2" => "xe1xbbxa3",
"xe1xbbxa4" => "xe1xbbxa5",
"xe1xbbxa6" => "xe1xbbxa7",
"xe1xbbxa8" => "xe1xbbxa9",
"xe1xbbxaa" => "xe1xbbxab",
"xe1xbbxac" => "xe1xbbxad",
"xe1xbbxae" => "xe1xbbxaf",
"xe1xbbxb0" => "xe1xbbxb1",
"xe1xbbxb2" => "xe1xbbxb3",
"xe1xbbxb4" => "xe1xbbxb5",
"xe1xbbxb6" => "xe1xbbxb7",
"xe1xbbxb8" => "xe1xbbxb9",
"xe1xbcx88" => "xe1xbcx80",
"xe1xbcx89" => "xe1xbcx81",
"xe1xbcx8a" => "xe1xbcx82",
"xe1xbcx8b" => "xe1xbcx83",
"xe1xbcx8c" => "xe1xbcx84",
"xe1xbcx8d" => "xe1xbcx85",
"xe1xbcx8e" => "xe1xbcx86",
"xe1xbcx8f" => "xe1xbcx87",
"xe1xbcx98" => "xe1xbcx90",
"xe1xbcx99" => "xe1xbcx91",
"xe1xbcx9a" => "xe1xbcx92",
"xe1xbcx9b" => "xe1xbcx93",
"xe1xbcx9c" => "xe1xbcx94",
"xe1xbcx9d" => "xe1xbcx95",
"xe1xbcxa9" => "xe1xbcxa1",
"xe1xbcxaa" => "xe1xbcxa2",
"xe1xbcxab" => "xe1xbcxa3",
"xe1xbcxac" => "xe1xbcxa4",
"xe1xbcxad" => "xe1xbcxa5",
"xe1xbcxae" => "xe1xbcxa6",
"xe1xbcxaf" => "xe1xbcxa7",
"xe1xbcxb8" => "xe1xbcxb0",
"xe1xbcxb9" => "xe1xbcxb1",
"xe1xbcxba" => "xe1xbcxb2",
"xe1xbcxbb" => "xe1xbcxb3",
"xe1xbcxbc" => "xe1xbcxb4",
"xe1xbcxbd" => "xe1xbcxb5",
"xe1xbcxbe" => "xe1xbcxb6",
"xe1xbcxbf" => "xe1xbcxb7",
"xe1xbdx88" => "xe1xbdx80",
"xe1xbdx89" => "xe1xbdx81",
"xe1xbdx8a" => "xe1xbdx82",
"xe1xbdx8b" => "xe1xbdx83",
"xe1xbdx8c" => "xe1xbdx84",
"xe1xbdx8d" => "xe1xbdx85",
"xe1xbdx99" => "xe1xbdx91",
"xe1xbdx9b" => "xe1xbdx93",
"xe1xbdx9d" => "xe1xbdx95",
"xe1xbdx9f" => "xe1xbdx97",
"xe1xbdxa9" => "xe1xbdxa1",
"xe1xbdxaa" => "xe1xbdxa2",
"xe1xbdxab" => "xe1xbdxa3",
"xe1xbdxac" => "xe1xbdxa4",
"xe1xbdxad" => "xe1xbdxa5",
"xe1xbdxae" => "xe1xbdxa6",
"xe1xbdxaf" => "xe1xbdxa7",
"xe1xbex88" => "xe1xbex80",
"xe1xbex89" => "xe1xbex81",
"xe1xbex8a" => "xe1xbex82",
"xe1xbex8b" => "xe1xbex83",
"xe1xbex8c" => "xe1xbex84",
"xe1xbex8d" => "xe1xbex85",
"xe1xbex8e" => "xe1xbex86",
"xe1xbex8f" => "xe1xbex87",
"xe1xbex98" => "xe1xbex90",
"xe1xbex99" => "xe1xbex91",
"xe1xbex9a" => "xe1xbex92",
"xe1xbex9b" => "xe1xbex93",
"xe1xbex9c" => "xe1xbex94",
"xe1xbex9d" => "xe1xbex95",
"xe1xbex9e" => "xe1xbex96",
"xe1xbex9f" => "xe1xbex97",
"xe1xbexa9" => "xe1xbexa1",
"xe1xbexaa" => "xe1xbexa2",
"xe1xbexab" => "xe1xbexa3",
"xe1xbexac" => "xe1xbexa4",
"xe1xbexad" => "xe1xbexa5",
"xe1xbexae" => "xe1xbexa6",
"xe1xbexaf" => "xe1xbexa7",
"xe1xbexb8" => "xe1xbexb0",
"xe1xbexb9" => "xe1xbexb1",
"xe1xbexba" => "xe1xbdxb0",
"xe1xbexbb" => "xe1xbdxb1",
"xe1xbexbc" => "xe1xbexb3",
"xe1xbfx88" => "xe1xbdxb2",
"xe1xbfx89" => "xe1xbdxb3",
"xe1xbfx8a" => "xe1xbdxb4",
"xe1xbfx8b" => "xe1xbdxb5",
"xe1xbfx8c" => "xe1xbfx83",
"xe1xbfx98" => "xe1xbfx90",
"xe1xbfx99" => "xe1xbfx91",
"xe1xbfx9a" => "xe1xbdxb6",
"xe1xbfx9b" => "xe1xbdxb7",
"xe1xbfxa9" => "xe1xbfxa1",
"xe1xbfxaa" => "xe1xbdxba",
"xe1xbfxab" => "xe1xbdxbb",
"xe1xbfxac" => "xe1xbfxa5",
"xe1xbfxb8" => "xe1xbdxb8",
"xe1xbfxb9" => "xe1xbdxb9",
"xe1xbfxba" => "xe1xbdxbc",
"xe1xbfxbb" => "xe1xbdxbd",
"xe1xbfxbc" => "xe1xbfxb3",
"xefxbcxa1" => "xefxbdx81",
"xefxbcxa2" => "xefxbdx82",
"xefxbcxa3" => "xefxbdx83",
"xefxbcxa4" => "xefxbdx84",
"xefxbcxa5" => "xefxbdx85",
"xefxbcxa6" => "xefxbdx86",
"xefxbcxa7" => "xefxbdx87",
"xefxbcxa8" => "xefxbdx88",
"xefxbcxa9" => "xefxbdx89",
"xefxbcxaa" => "xefxbdx8a",
"xefxbcxab" => "xefxbdx8b",
"xefxbcxac" => "xefxbdx8c",
"xefxbcxad" => "xefxbdx8d",
"xefxbcxae" => "xefxbdx8e",
"xefxbcxaf" => "xefxbdx8f",
"xefxbcxb0" => "xefxbdx90",
"xefxbcxb1" => "xefxbdx91",
"xefxbcxb2" => "xefxbdx92",
"xefxbcxb3" => "xefxbdx93",
"xefxbcxb4" => "xefxbdx94",
"xefxbcxb5" => "xefxbdx95",
"xefxbcxb6" => "xefxbdx96",
"xefxbcxb7" => "xefxbdx97",
"xefxbcxb8" => "xefxbdx98",
"xefxbcxb9" => "xefxbdx99",
"xefxbcxba" => "xefxbdx9a",
);
#Table of Unicode blocks, Unicode Character Database 6.0.0 (2010-06-04).
#Autogenerated by unicode_blocks_txt2php() PHP function at 2011-06-04 00:19:39, 209 blocks total.
#Format: block name => array(first code point, last code point, zero-based ordinal of the block in this table).
public static $unicode_blocks = array(
    'Basic Latin' => array(0x0000, 0x007F, 0),
    'Latin-1 Supplement' => array(0x0080, 0x00FF, 1),
    'Latin Extended-A' => array(0x0100, 0x017F, 2),
    'Latin Extended-B' => array(0x0180, 0x024F, 3),
    'IPA Extensions' => array(0x0250, 0x02AF, 4),
    'Spacing Modifier Letters' => array(0x02B0, 0x02FF, 5),
    'Combining Diacritical Marks' => array(0x0300, 0x036F, 6),
    'Greek and Coptic' => array(0x0370, 0x03FF, 7),
    'Cyrillic' => array(0x0400, 0x04FF, 8),
    'Cyrillic Supplement' => array(0x0500, 0x052F, 9),
    'Armenian' => array(0x0530, 0x058F, 10),
    'Hebrew' => array(0x0590, 0x05FF, 11),
    'Arabic' => array(0x0600, 0x06FF, 12),
    'Syriac' => array(0x0700, 0x074F, 13),
    'Arabic Supplement' => array(0x0750, 0x077F, 14),
    'Thaana' => array(0x0780, 0x07BF, 15),
    'NKo' => array(0x07C0, 0x07FF, 16),
    'Samaritan' => array(0x0800, 0x083F, 17),
    'Mandaic' => array(0x0840, 0x085F, 18),
    'Devanagari' => array(0x0900, 0x097F, 19),
    'Bengali' => array(0x0980, 0x09FF, 20),
    'Gurmukhi' => array(0x0A00, 0x0A7F, 21),
    'Gujarati' => array(0x0A80, 0x0AFF, 22),
    'Oriya' => array(0x0B00, 0x0B7F, 23),
    'Tamil' => array(0x0B80, 0x0BFF, 24),
    'Telugu' => array(0x0C00, 0x0C7F, 25),
    'Kannada' => array(0x0C80, 0x0CFF, 26),
    'Malayalam' => array(0x0D00, 0x0D7F, 27),
    'Sinhala' => array(0x0D80, 0x0DFF, 28),
    'Thai' => array(0x0E00, 0x0E7F, 29),
    'Lao' => array(0x0E80, 0x0EFF, 30),
    'Tibetan' => array(0x0F00, 0x0FFF, 31),
    'Myanmar' => array(0x1000, 0x109F, 32),
    'Georgian' => array(0x10A0, 0x10FF, 33),
    'Hangul Jamo' => array(0x1100, 0x11FF, 34),
    'Ethiopic' => array(0x1200, 0x137F, 35),
    'Ethiopic Supplement' => array(0x1380, 0x139F, 36),
    'Cherokee' => array(0x13A0, 0x13FF, 37),
    'Unified Canadian Aboriginal Syllabics' => array(0x1400, 0x167F, 38),
    'Ogham' => array(0x1680, 0x169F, 39),
    'Runic' => array(0x16A0, 0x16FF, 40),
    'Tagalog' => array(0x1700, 0x171F, 41),
    'Hanunoo' => array(0x1720, 0x173F, 42),
    'Buhid' => array(0x1740, 0x175F, 43),
    'Tagbanwa' => array(0x1760, 0x177F, 44),
    'Khmer' => array(0x1780, 0x17FF, 45),
    'Mongolian' => array(0x1800, 0x18AF, 46),
    'Unified Canadian Aboriginal Syllabics Extended' => array(0x18B0, 0x18FF, 47),
    'Limbu' => array(0x1900, 0x194F, 48),
    'Tai Le' => array(0x1950, 0x197F, 49),
    'New Tai Lue' => array(0x1980, 0x19DF, 50),
    'Khmer Symbols' => array(0x19E0, 0x19FF, 51),
    'Buginese' => array(0x1A00, 0x1A1F, 52),
    'Tai Tham' => array(0x1A20, 0x1AAF, 53),
    'Balinese' => array(0x1B00, 0x1B7F, 54),
    'Sundanese' => array(0x1B80, 0x1BBF, 55),
    'Batak' => array(0x1BC0, 0x1BFF, 56),
    'Lepcha' => array(0x1C00, 0x1C4F, 57),
    'Ol Chiki' => array(0x1C50, 0x1C7F, 58),
    'Vedic Extensions' => array(0x1CD0, 0x1CFF, 59),
    'Phonetic Extensions' => array(0x1D00, 0x1D7F, 60),
    'Phonetic Extensions Supplement' => array(0x1D80, 0x1DBF, 61),
    'Combining Diacritical Marks Supplement' => array(0x1DC0, 0x1DFF, 62),
    'Latin Extended Additional' => array(0x1E00, 0x1EFF, 63),
    'Greek Extended' => array(0x1F00, 0x1FFF, 64),
    'General Punctuation' => array(0x2000, 0x206F, 65),
    'Superscripts and Subscripts' => array(0x2070, 0x209F, 66),
    'Currency Symbols' => array(0x20A0, 0x20CF, 67),
    'Combining Diacritical Marks for Symbols' => array(0x20D0, 0x20FF, 68),
    'Letterlike Symbols' => array(0x2100, 0x214F, 69),
    'Number Forms' => array(0x2150, 0x218F, 70),
    'Arrows' => array(0x2190, 0x21FF, 71),
    'Mathematical Operators' => array(0x2200, 0x22FF, 72),
    'Miscellaneous Technical' => array(0x2300, 0x23FF, 73),
    'Control Pictures' => array(0x2400, 0x243F, 74),
    'Optical Character Recognition' => array(0x2440, 0x245F, 75),
    'Enclosed Alphanumerics' => array(0x2460, 0x24FF, 76),
    'Box Drawing' => array(0x2500, 0x257F, 77),
    'Block Elements' => array(0x2580, 0x259F, 78),
    'Geometric Shapes' => array(0x25A0, 0x25FF, 79),
    'Miscellaneous Symbols' => array(0x2600, 0x26FF, 80),
    'Dingbats' => array(0x2700, 0x27BF, 81),
    'Miscellaneous Mathematical Symbols-A' => array(0x27C0, 0x27EF, 82),
    'Supplemental Arrows-A' => array(0x27F0, 0x27FF, 83),
    'Braille Patterns' => array(0x2800, 0x28FF, 84),
    'Supplemental Arrows-B' => array(0x2900, 0x297F, 85),
    'Miscellaneous Mathematical Symbols-B' => array(0x2980, 0x29FF, 86),
    'Supplemental Mathematical Operators' => array(0x2A00, 0x2AFF, 87),
    'Miscellaneous Symbols and Arrows' => array(0x2B00, 0x2BFF, 88),
    'Glagolitic' => array(0x2C00, 0x2C5F, 89),
    'Latin Extended-C' => array(0x2C60, 0x2C7F, 90),
    'Coptic' => array(0x2C80, 0x2CFF, 91),
    'Georgian Supplement' => array(0x2D00, 0x2D2F, 92),
    'Tifinagh' => array(0x2D30, 0x2D7F, 93),
    'Ethiopic Extended' => array(0x2D80, 0x2DDF, 94),
    'Cyrillic Extended-A' => array(0x2DE0, 0x2DFF, 95),
    'Supplemental Punctuation' => array(0x2E00, 0x2E7F, 96),
    'CJK Radicals Supplement' => array(0x2E80, 0x2EFF, 97),
    'Kangxi Radicals' => array(0x2F00, 0x2FDF, 98),
    'Ideographic Description Characters' => array(0x2FF0, 0x2FFF, 99),
    'CJK Symbols and Punctuation' => array(0x3000, 0x303F, 100),
    'Hiragana' => array(0x3040, 0x309F, 101),
    'Katakana' => array(0x30A0, 0x30FF, 102),
    'Bopomofo' => array(0x3100, 0x312F, 103),
    'Hangul Compatibility Jamo' => array(0x3130, 0x318F, 104),
    'Kanbun' => array(0x3190, 0x319F, 105),
    'Bopomofo Extended' => array(0x31A0, 0x31BF, 106),
    'CJK Strokes' => array(0x31C0, 0x31EF, 107),
    'Katakana Phonetic Extensions' => array(0x31F0, 0x31FF, 108),
    'Enclosed CJK Letters and Months' => array(0x3200, 0x32FF, 109),
    'CJK Compatibility' => array(0x3300, 0x33FF, 110),
    'CJK Unified Ideographs Extension A' => array(0x3400, 0x4DBF, 111),
    'Yijing Hexagram Symbols' => array(0x4DC0, 0x4DFF, 112),
    'CJK Unified Ideographs' => array(0x4E00, 0x9FFF, 113),
    'Yi Syllables' => array(0xA000, 0xA48F, 114),
    'Yi Radicals' => array(0xA490, 0xA4CF, 115),
    'Lisu' => array(0xA4D0, 0xA4FF, 116),
    'Vai' => array(0xA500, 0xA63F, 117),
    'Cyrillic Extended-B' => array(0xA640, 0xA69F, 118),
    'Bamum' => array(0xA6A0, 0xA6FF, 119),
    'Modifier Tone Letters' => array(0xA700, 0xA71F, 120),
    'Latin Extended-D' => array(0xA720, 0xA7FF, 121),
    'Syloti Nagri' => array(0xA800, 0xA82F, 122),
    'Common Indic Number Forms' => array(0xA830, 0xA83F, 123),
    'Phags-pa' => array(0xA840, 0xA87F, 124),
    'Saurashtra' => array(0xA880, 0xA8DF, 125),
    'Devanagari Extended' => array(0xA8E0, 0xA8FF, 126),
    'Kayah Li' => array(0xA900, 0xA92F, 127),
    'Rejang' => array(0xA930, 0xA95F, 128),
    'Hangul Jamo Extended-A' => array(0xA960, 0xA97F, 129),
    'Javanese' => array(0xA980, 0xA9DF, 130),
    'Cham' => array(0xAA00, 0xAA5F, 131),
    'Myanmar Extended-A' => array(0xAA60, 0xAA7F, 132),
    'Tai Viet' => array(0xAA80, 0xAADF, 133),
    'Ethiopic Extended-A' => array(0xAB00, 0xAB2F, 134),
    'Meetei Mayek' => array(0xABC0, 0xABFF, 135),
    'Hangul Syllables' => array(0xAC00, 0xD7AF, 136),
    'Hangul Jamo Extended-B' => array(0xD7B0, 0xD7FF, 137),
    'High Surrogates' => array(0xD800, 0xDB7F, 138),
    'High Private Use Surrogates' => array(0xDB80, 0xDBFF, 139),
    'Low Surrogates' => array(0xDC00, 0xDFFF, 140),
    'Private Use Area' => array(0xE000, 0xF8FF, 141),
    'CJK Compatibility Ideographs' => array(0xF900, 0xFAFF, 142),
    'Alphabetic Presentation Forms' => array(0xFB00, 0xFB4F, 143),
    'Arabic Presentation Forms-A' => array(0xFB50, 0xFDFF, 144),
    'Variation Selectors' => array(0xFE00, 0xFE0F, 145),
    'Vertical Forms' => array(0xFE10, 0xFE1F, 146),
    'Combining Half Marks' => array(0xFE20, 0xFE2F, 147),
    'CJK Compatibility Forms' => array(0xFE30, 0xFE4F, 148),
    'Small Form Variants' => array(0xFE50, 0xFE6F, 149),
    'Arabic Presentation Forms-B' => array(0xFE70, 0xFEFF, 150),
    'Halfwidth and Fullwidth Forms' => array(0xFF00, 0xFFEF, 151),
    'Specials' => array(0xFFF0, 0xFFFF, 152),
    'Linear B Syllabary' => array(0x10000, 0x1007F, 153),
    'Linear B Ideograms' => array(0x10080, 0x100FF, 154),
    'Aegean Numbers' => array(0x10100, 0x1013F, 155),
    'Ancient Greek Numbers' => array(0x10140, 0x1018F, 156),
    'Ancient Symbols' => array(0x10190, 0x101CF, 157),
    'Phaistos Disc' => array(0x101D0, 0x101FF, 158),
    'Lycian' => array(0x10280, 0x1029F, 159),
    'Carian' => array(0x102A0, 0x102DF, 160),
    'Old Italic' => array(0x10300, 0x1032F, 161),
    'Gothic' => array(0x10330, 0x1034F, 162),
    'Ugaritic' => array(0x10380, 0x1039F, 163),
    'Old Persian' => array(0x103A0, 0x103DF, 164),
    'Deseret' => array(0x10400, 0x1044F, 165),
    'Shavian' => array(0x10450, 0x1047F, 166),
    'Osmanya' => array(0x10480, 0x104AF, 167),
    'Cypriot Syllabary' => array(0x10800, 0x1083F, 168),
    'Imperial Aramaic' => array(0x10840, 0x1085F, 169),
    'Phoenician' => array(0x10900, 0x1091F, 170),
    'Lydian' => array(0x10920, 0x1093F, 171),
    'Kharoshthi' => array(0x10A00, 0x10A5F, 172),
    'Old South Arabian' => array(0x10A60, 0x10A7F, 173),
    'Avestan' => array(0x10B00, 0x10B3F, 174),
    'Inscriptional Parthian' => array(0x10B40, 0x10B5F, 175),
    'Inscriptional Pahlavi' => array(0x10B60, 0x10B7F, 176),
    'Old Turkic' => array(0x10C00, 0x10C4F, 177),
    'Rumi Numeral Symbols' => array(0x10E60, 0x10E7F, 178),
    'Brahmi' => array(0x11000, 0x1107F, 179),
    'Kaithi' => array(0x11080, 0x110CF, 180),
    'Cuneiform' => array(0x12000, 0x123FF, 181),
    'Cuneiform Numbers and Punctuation' => array(0x12400, 0x1247F, 182),
    'Egyptian Hieroglyphs' => array(0x13000, 0x1342F, 183),
    'Bamum Supplement' => array(0x16800, 0x16A3F, 184),
    'Kana Supplement' => array(0x1B000, 0x1B0FF, 185),
    'Byzantine Musical Symbols' => array(0x1D000, 0x1D0FF, 186),
    'Musical Symbols' => array(0x1D100, 0x1D1FF, 187),
    'Ancient Greek Musical Notation' => array(0x1D200, 0x1D24F, 188),
    'Tai Xuan Jing Symbols' => array(0x1D300, 0x1D35F, 189),
    'Counting Rod Numerals' => array(0x1D360, 0x1D37F, 190),
    'Mathematical Alphanumeric Symbols' => array(0x1D400, 0x1D7FF, 191),
    'Mahjong Tiles' => array(0x1F000, 0x1F02F, 192),
    'Domino Tiles' => array(0x1F030, 0x1F09F, 193),
    'Playing Cards' => array(0x1F0A0, 0x1F0FF, 194),
    'Enclosed Alphanumeric Supplement' => array(0x1F100, 0x1F1FF, 195),
    'Enclosed Ideographic Supplement' => array(0x1F200, 0x1F2FF, 196),
    'Miscellaneous Symbols And Pictographs' => array(0x1F300, 0x1F5FF, 197),
    'Emoticons' => array(0x1F600, 0x1F64F, 198),
    'Transport And Map Symbols' => array(0x1F680, 0x1F6FF, 199),
    'Alchemical Symbols' => array(0x1F700, 0x1F77F, 200),
    'CJK Unified Ideographs Extension B' => array(0x20000, 0x2A6DF, 201),
    'CJK Unified Ideographs Extension C' => array(0x2A700, 0x2B73F, 202),
    'CJK Unified Ideographs Extension D' => array(0x2B740, 0x2B81F, 203),
    'CJK Compatibility Ideographs Supplement' => array(0x2F800, 0x2FA1F, 204),
    'Tags' => array(0xE0000, 0xE007F, 205),
    'Variation Selectors Supplement' => array(0xE0100, 0xE01EF, 206),
    'Supplementary Private Use Area-A' => array(0xF0000, 0xFFFFF, 207),
    'Supplementary Private Use Area-B' => array(0x100000, 0x10FFFF, 208),
);
/**
* Private constructor: instances of this class cannot be created from outside,
* its methods are meant to be called only statically.
*/
private function __construct()
{
}
/**
* Removes combining diacritical marks from a string and, on request, records
* what was removed so that self::diactrical_restore() can put it back.
*
* When $is_can_restored is true, $restore_table is filled with:
*   'offsets' -- map of character (not byte!) position in the cleaned string => removed marks
*   'length'  -- length of the cleaned string in characters
* If nothing was removed, $restore_table is left as an empty array.
*
* @param string|null $s
* @param array|null $additional_chars for example: "\xc2\xad" #soft hyphen = discretionary hyphen
* @param bool $is_can_restored
* @param array|null &$restore_table
* @return string|bool|null Returns FALSE if error occurred
*/
public static function diactrical_remove($s, $additional_chars = null, $is_can_restored = false, &$restore_table = null)
{
    if (! ReflectionTypeHint::isValid()) return false;
    if (is_null($s)) return $s;

    #build the alternation: built-in diacritical marks plus optional extra characters
    $alternatives = self::$diactrical_re;
    if ($additional_chars)
    {
        $quoted = array();
        foreach ($additional_chars as $char) $quoted[] = preg_quote($char, '/');
        $alternatives .= '|' . implode('|', $quoted);
    }
    #NOTE(review): the pattern uses the "x" (extended) modifier, so unescaped whitespace coming
    #from $additional_chars would be ignored by PCRE -- confirm callers never pass whitespace
    $re = '/((?>' . $alternatives . ')+)/sxSX';

    if (! $is_can_restored) return preg_replace($re, '', $s);

    $restore_table = array();
    #with PREG_SPLIT_DELIM_CAPTURE the result alternates: text, marks, text, marks, ..., text
    $parts = preg_split($re, $s, -1, PREG_SPLIT_DELIM_CAPTURE);
    $total = count($parts);
    if ($total === 1) return $s; #nothing to remove

    $last  = $total - 1;
    $clean = '';
    $pos   = 0;
    $i     = 0;
    while ($i < $last)
    {
        $text = $parts[$i];
        $clean .= $text;
        #remember character (not byte!) positions
        $pos += self::strlen($text);
        $restore_table['offsets'][$pos] = $parts[$i + 1];
        $i += 2;
    }
    $tail = $parts[$last];
    $restore_table['length'] = $pos + self::strlen($tail);
    return $clean . $tail;
}
/**
* Restores combining diacritical marks previously removed by self::diactrical_remove().
* This only works if the character positions and the number of characters
* in the string have not changed since the marks were removed!
*
* @see self::diactrical_remove()
* @param string|null $s
* @param array $restore_table
* @return string|bool|null Returns FALSE if error occurred (broken $restore_table)
*/
public static function diactrical_restore($s, array $restore_table)
{
    if (! ReflectionTypeHint::isValid()) return false;
    if (is_null($s) || ! $restore_table) return $s;

    #the table must have an integer 'length' equal to the current character length
    #of the string and an array of 'offsets', otherwise it is considered broken
    $is_valid = isset($restore_table['length'], $restore_table['offsets'])
        && is_int($restore_table['length'])
        && is_array($restore_table['offsets'])
        && $restore_table['length'] === self::strlen($s);
    if (! $is_valid) return false;

    $restored = '';
    $offset = 0;
    foreach ($restore_table['offsets'] as $pos => $marks)
    {
        #copy the text up to the recorded position, then re-insert the marks
        $restored .= self::substr($s, $offset, $pos - $offset) . $marks;
        $offset = $pos;
    }
    #the byte length is passed as the character count for the remaining tail
    return $restored . self::substr($s, $offset, strlen($s));
}
/**
* Converts data from the given character encoding to UTF-8.
* The work is delegated to self::_convert().
*
* @param array|scalar|null $data
* @param string $charset
* @return array|scalar|null Returns FALSE if error occurred
*/
public static function convert_from($data, $charset = 'cp1251')
{
    return ReflectionTypeHint::isValid()
        ? self::_convert($data, $charset, 'UTF-8')
        : false;
}
/**
* Converts data from UTF-8 to the given character encoding.
* The work is delegated to self::_convert().
*
* @param array|scalar|null $data
* @param string $charset
* @return array|scalar|null Returns FALSE if error occurred
*/
public static function convert_to($data, $charset = 'cp1251')
{
    return ReflectionTypeHint::isValid()
        ? self::_convert($data, 'UTF-8', $charset)
        : false;
}
/**
* Recodes data of any structure from one character encoding to another.
* Arrays are walked recursively, both keys and values are recoded.
* Non-string scalars and null are returned unchanged.
*
* Strings are converted by the first available mechanism: iconv, mbstring,
* built-in fallbacks for a few charsets, and finally recode_string().
*
* @see mb_encoding_aliases()
* @param array|scalar|null $data
* @param string $charset_from
* @param string $charset_to
* @return array|scalar|null Returns FALSE if error occurred
*/
private static function _convert($data, $charset_from, $charset_to)
{
    if (! ReflectionTypeHint::isValid()) return false; #for recursive calls
    if ($charset_from === $charset_to) return $data;

    if (is_array($data))
    {
        $result = array();
        foreach ($data as $key => $value)
        {
            $new_key = self::_convert($key, $charset_from, $charset_to);
            if ($new_key === false) return false;
            $new_value = self::_convert($value, $charset_from, $charset_to);
            $result[$new_key] = $new_value;
            #FALSE is an error unless the source value itself was a boolean
            if ($new_value === false && ! is_bool($value)) return false;
        }
        return $result;
    }

    if (! is_string($data)) return $data;

    #smart behaviour: protects from errors and improves speed
    if ($charset_from === 'UTF-8' && ! self::is_utf8($data)) return $data;
    if ($charset_to === 'UTF-8' && self::is_utf8($data)) return $data;

    #since PHP-5.3.x iconv() is faster than mb_convert_encoding()
    if (function_exists('iconv'))
    {
        return iconv($charset_from, $charset_to . '//IGNORE//TRANSLIT', $data);
    }
    if (function_exists('mb_convert_encoding'))
    {
        return mb_convert_encoding($data, $charset_to, $charset_from);
    }

    #fallbacks selected by the source charset
    if (in_array($charset_from, array('UTF-16', 'UCS-2'), true))
    {
        return self::_convert_from_utf16($data);
    }
    if (in_array($charset_from, array('cp1251', 'cp1259'), true))
    {
        return strtr($data, self::$cp1259_table);
    }
    #source charset => convert_cyr_string() code; the text is first brought to windows-1251 ('w')
    #NOTE(review): convert_cyr_string() was removed in PHP 8.0 -- this branch only runs
    #when neither iconv nor mbstring is available
    $cyr_codes = array(
        'koi8-r'       => 'k',
        'KOI8-R'       => 'k',
        'iso8859-5'    => 'i',
        'cp866'        => 'a',
        'mac-cyrillic' => 'm',
    );
    if (is_string($charset_from) && isset($cyr_codes[$charset_from]))
    {
        return strtr(convert_cyr_string($data, $cyr_codes[$charset_from], 'w'), self::$cp1259_table);
    }

    #fallback selected by the target charset
    if (in_array($charset_to, array('cp1251', 'cp1259'), true))
    {
        return strtr($data, array_flip(self::$cp1259_table));
    }

    #last attempt
    if (function_exists('recode_string'))
    {
        $recoded = @recode_string($charset_from . '..' . $charset_to, $data);
        if (is_string($recoded)) return $recoded;
    }

    trigger_error('Convert "' . $charset_from . '" --> "' . $charset_to . '" is not supported native, "iconv" or "mbstring" extension required', E_USER_WARNING);
    return false;
}
/**
* Converts a UTF-16 / UCS-2 encoded string to UTF-8.
* UTF-16 surrogate pairs are supported.
*
* iconv or mbstring is used when available. Otherwise the string is decoded
* manually; in that case every unpaired surrogate (a high surrogate not followed
* by a low one, a low surrogate without a preceding high one, or a high surrogate
* at the very end) is replaced by U+FFFD REPLACEMENT CHARACTER and decoding
* continues with the next code unit, so no valid character is lost.
*
* @param string $s
* @param string $type 'BE' -- big endian byte order
* 'LE' -- little endian byte order
* @param bool $to_array returns array chars instead whole string?
* @return string|array|bool UTF-8 string, array chars or FALSE if error occurred
*/
private static function _convert_from_utf16($s, $type = 'BE', $to_array = false)
{
    static $types = array(
        'BE' => 'n', #unsigned short (always 16 bit, big endian byte order)
        'LE' => 'v', #unsigned short (always 16 bit, little endian byte order)
    );
    if (! array_key_exists($type, $types))
    {
        trigger_error('Unexpected value in 2-nd parameter, "' . $type . '" given!', E_USER_WARNING);
        return false;
    }
    #the fastest way:
    if (function_exists('iconv') || function_exists('mb_convert_encoding'))
    {
        if (function_exists('iconv')) $s = iconv('UTF-16' . $type, 'UTF-8', $s);
        elseif (function_exists('mb_convert_encoding')) $s = mb_convert_encoding($s, 'UTF-8', 'UTF-16' . $type);
        if (! $to_array) return $s;
        return self::str_split($s);
    }
    /*
    http://en.wikipedia.org/wiki/UTF-16
    UTF-16 represents non-BMP characters (U+10000 through U+10FFFF) using a pair
    of 16-bit words, known as a surrogate pair. 0x10000 is subtracted from the code
    point to give a 20-bit value, which is split into two 10-bit halves:
    0xD800-0xDBFF carries the most significant half (high surrogate),
    0xDC00-0xDFFF carries the least significant half (low surrogate).
    Code points U+D800-U+DFFF are never assigned to characters, so a single
    surrogate never represents a character by itself.
    http://www.russellcottrell.com/greek/utilities/SurrogatePairCalculator.htm
    http://www.russellcottrell.com/greek/utilities/UnicodeRanges.htm
    Conversion of a surrogate pair <H, L> to a scalar value:
    N = ((H - 0xD800) * 0x400) + (L - 0xDC00) + 0x10000;
    */
    $replacement = "\xEF\xBF\xBD"; #U+FFFD REPLACEMENT CHARACTER (for broken char)
    $a = array();
    $hi = false; #pending high surrogate, FALSE if none
    foreach (unpack($types[$type] . '*', $s) as $unit)
    {
        if ($hi !== false)
        {
            $pending = $hi;
            $hi = false;
            if ($unit >= 0xDC00 && $unit <= 0xDFFF)
            {
                #valid surrogate pair
                $a[] = self::chr((($pending - 0xD800) * 0x400) + ($unit - 0xDC00) + 0x10000);
                continue;
            }
            #unpaired high surrogate: mark it as broken and process the current unit below
            $a[] = $replacement;
        }
        if ($unit >= 0xD800 && $unit <= 0xDBFF) $hi = $unit; #high surrogate was found
        elseif ($unit >= 0xDC00 && $unit <= 0xDFFF) $a[] = $replacement; #low surrogate without a high one
        else $a[] = self::chr($unit); #not surrogate
    }
    #high surrogate at the very end of the input
    if ($hi !== false) $a[] = $replacement;
    return $to_array ? $a : implode('', $a);
}
/**
 * Strips device control codes in the ASCII range out of a string.
 *
 * Bytes 0x00-0x08, 0x0B, 0x0C and 0x0E-0x1F are removed; tab (0x09),
 * line feed (0x0A) and carriage return (0x0D) are kept.
 *
 * @param   string|null  $s  String to clean
 * @return  string|bool|null  Cleaned string, NULL for NULL input,
 *                            FALSE if ReflectionTypeHint::isValid() fails
 */
public static function strict($s)
{
    if (! ReflectionTypeHint::isValid()) return false;
    if (is_null($s)) return $s;
    #the \x escapes are essential: without them the class matches the literal characters "x", "0", "8", ...
    return preg_replace('/[\x00-\x08\x0B\x0C\x0E-\x1F]+/sSX', '', $s);
}
/**
 * Checks that the data consists of ASCII bytes (0x00-0x7F) only.
 * For null, integer, float, boolean returns TRUE.
 *
 * Arrays are traversed recursively (keys and values); if at least one key
 * or value is not ASCII, FALSE is returned.
 *
 * @param   array|scalar|null  $data
 * @return  bool  FALSE also for objects and resources
 */
public static function is_ascii($data)
{
    if (! ReflectionTypeHint::isValid()) return false;
    if (is_array($data))
    {
        #iterate by value: nothing is modified, so a reference is not needed
        foreach ($data as $k => $v)
        {
            if (! self::is_ascii($k) || ! self::is_ascii($v)) return false;
        }
        return true;
    }
    #ltrim() with a byte range is a little faster than preg_match('/^[\x00-\x7f]*$/sSX', $data)
    if (is_string($data)) return ltrim($data, "\x00..\x7f") === '';
    if (is_scalar($data) || is_null($data)) return true; #null, integer, float, boolean
    return false; #object or resource
}
/**
 * Returns TRUE if the data is valid UTF-8 and FALSE otherwise.
 * For null, integer, float, boolean returns TRUE.
 *
 * Arrays are traversed recursively (keys and values); if at least one key
 * or value is not valid UTF-8, FALSE is returned.
 *
 * @link    http://www.w3.org/International/questions/qa-forms-utf-8.html
 * @link    http://ru3.php.net/mb_detect_encoding
 * @link    http://webtest.philigon.ru/articles/utf8/
 * @link    http://unicode.coeurlumiere.com/
 * @param   array|scalar|null  $data
 * @param   bool               $is_strict  When TRUE, strings containing any byte outside
 *                                         0x09, 0x0A, 0x0D, 0x20-0xBF, 0xC2-0xF7 are rejected as well
 *                                         (this excludes the ASCII control codes)
 * @return  bool  FALSE also for objects and resources
 */
public static function is_utf8($data, $is_strict = true)
{
    if (! ReflectionTypeHint::isValid()) return false;
    if (is_array($data))
    {
        #iterate by value: nothing is modified, so a reference is not needed
        foreach ($data as $k => $v)
        {
            if (! self::is_utf8($k, $is_strict) || ! self::is_utf8($v, $is_strict)) return false;
        }
        return true;
    }
    if (is_string($data))
    {
        #an empty pattern with the /u modifier only fails when the subject is not valid UTF-8;
        #this is much faster (up to 4 times) than mb_check_encoding($data, 'UTF-8')
        if (! preg_match('~~suSX', $data)) return false;
        if (function_exists('preg_last_error') && preg_last_error() !== PREG_NO_ERROR) return false;
        #byte-level (no /u) blacklist of control codes and bytes that the strict mode does not allow
        if ($is_strict && preg_match('/[^\x09\x0A\x0D\x20-\xBF\xC2-\xF7]/sSX', $data)) return false;
        return true;
    }
    if (is_scalar($data) || is_null($data)) return true; #null, integer, float, boolean
    return false; #object or resource
}
/**
 * Tries to detect whether a string looks like UTF-8 by walking it byte by byte.
 *
 * Only the structure is verified: every lead byte must announce 1 to 5 continuation
 * bytes and exactly that many bytes of the form 10bbbbbb must follow
 * (so the legacy 5- and 6-byte forms are accepted too).
 *
 * @deprecated  Slow, use self::is_utf8() instead
 * @see     self::is_utf8()
 * @param   string  $s          Text
 * @param   bool    $is_strict  Strict check of the ASCII range? When not FALSE, single bytes
 *                              are accepted only if they are 0x20-0x7E, 0x09, 0x0A or 0x0D
 * @return  bool
 */
public static function check($s, $is_strict = true)
{
    if (! ReflectionTypeHint::isValid()) return false;
    $total = strlen($s);
    $pos = 0;
    while ($pos < $total)
    {
        $byte = ord($s[$pos]);
        #1 byte: 0bbbbbbb
        if ($byte < 0x80
            && ($is_strict === false || ($byte > 0x1F && $byte < 0x7F) || $byte == 0x09 || $byte == 0x0A || $byte == 0x0D))
        {
            $pos++;
            continue;
        }
        #number of continuation bytes announced by the lead byte
        if     (($byte & 0xE0) == 0xC0) $tail = 1; #2 bytes 110bbbbb 10bbbbbb
        elseif (($byte & 0xF0) == 0xE0) $tail = 2; #3 bytes 1110bbbb 10bbbbbb 10bbbbbb
        elseif (($byte & 0xF8) == 0xF0) $tail = 3; #4 bytes 11110bbb 10bbbbbb 10bbbbbb 10bbbbbb
        elseif (($byte & 0xFC) == 0xF8) $tail = 4; #5 bytes 111110bb 10bbbbbb 10bbbbbb 10bbbbbb 10bbbbbb
        elseif (($byte & 0xFE) == 0xFC) $tail = 5; #6 bytes 1111110b 10bbbbbb 10bbbbbb 10bbbbbb 10bbbbbb 10bbbbbb
        else return false; #does not match any model (this includes rejected ASCII bytes)
        #do $tail bytes matching 10bbbbbb follow?
        for ($k = 0; $k < $tail; $k++)
        {
            $pos++;
            if ($pos == $total || ((ord($s[$pos]) & 0xC0) != 0x80)) return false;
        }
        $pos++;
    }
    return true;
}
/**
 * Checks that all characters of UTF-8 data belong to the given ranges of the UNICODE standard.
 * A suitable alternative to regular expressions.
 *
 * For null, integer, float, boolean returns TRUE.
 *
 * Arrays are traversed recursively (keys and values).
 * If at least one key or value does not pass the check, FALSE is returned.
 *
 * @example
 *   #A simple check against standard named ranges:
 *   UTF8::blocks_check('поисковые системы Google и Yandex', array('Basic Latin', 'Cyrillic'));
 *   #Named ranges, direct ranges and single codepoints can be mixed:
 *   UTF8::blocks_check('поисковые системы Google и Yandex', array(array(0x20, 0x7E),     #[\x20-\x7E]
 *                                                                 array(0x0410, 0x044F), #[А-Яа-я]
 *                                                                 0x0401, #russian yo (Ё)
 *                                                                 0x0451, #russian ye (ё)
 *                                                                 'Arrows',
 *                                                                ));
 *
 * @link    http://www.unicode.org/charts/
 * @param   array|scalar|null  $data
 * @param   array|string       $blocks  Block name, or a list whose items are block names (keys of
 *                                      self::$unicode_blocks), array(min, max) pairs or single int codepoints
 * @return  bool  TRUE if every character belongs to one of the given ranges,
 *                FALSE otherwise, for broken UTF-8, or for an unknown / invalid block definition
 */
public static function blocks_check($data, $blocks)
{
    if (! ReflectionTypeHint::isValid()) return false;
    if (is_array($data))
    {
        #iterate by value: nothing is modified, so a reference is not needed
        foreach ($data as $k => $v)
        {
            if (! self::blocks_check($k, $blocks) || ! self::blocks_check($v, $blocks)) return false;
        }
        return true;
    }
    if (is_string($data))
    {
        $chars = self::str_split($data);
        if ($chars === false) return false; #broken UTF-8
        unset($data); #memory free
        $blocks = (array)$blocks; #loop invariant, so cast once
        $skip = array(); #cache of already accepted characters
        foreach ($chars as $char)
        {
            if (array_key_exists($char, $skip)) continue; #speed improve
            $codepoint = self::ord($char);
            if ($codepoint === false) return false; #broken UTF-8
            $is_valid = false;
            foreach ($blocks as $j => $block)
            {
                if (is_string($block))
                {
                    if (! array_key_exists($block, self::$unicode_blocks))
                    {
                        trigger_error('Unknown block "' . $block . '"!', E_USER_WARNING);
                        return false;
                    }
                    list ($min, $max) = self::$unicode_blocks[$block];
                }
                elseif (is_array($block)) list ($min, $max) = $block;
                elseif (is_int($block)) $min = $max = $block;
                else
                {
                    trigger_error('A string/array/int type expected for block[' . $j . ']!', E_USER_ERROR);
                    #if an error handler lets execution continue, do not compare against stale or undefined bounds
                    return false;
                }
                if ($codepoint >= $min && $codepoint <= $max)
                {
                    $is_valid = true;
                    break;
                }
            }
            if (! $is_valid) return false;
            $skip[$char] = null;
        }
        return true;
    }
    if (is_scalar($data) || is_null($data)) return true; #null, integer, float, boolean
    return false; #object or resource
}
/**
 * Recodes $_GET, $_POST, $_COOKIE and $_FILES from the $charset encoding to UTF-8, if necessary,
 * and rebuilds $_REQUEST when anything was converted.
 * A side effect is a positive protection against XSS attacks with non-printable characters
 * on vulnerable PHP functions.
 * Thus web forms can be sent to the server in two encodings: $charset and UTF-8.
 * For example: ?тест[тест]=тест
 *
 * How it works:
 *   1) The keys and values of $_GET, $_POST, $_COOKIE, $_FILES are checked for being valid UTF-8.
 *   2) Values that are not UTF-8 are treated as $charset and converted to UTF-8.
 *   3) Converted values are checked again. If the data still is not UTF-8,
 *      it is considered broken and the function returns FALSE.
 *
 * NOTICE
 *   The function must be called after self::unescape_request()!
 *
 * @see     self::unescape_request()
 * @param   bool    $is_hex2bin  Decode HEX data?
 *                               Example: 0xd09ec2a0d0bad0bed0bcd0bfd0b0d0bdd0b8d0b8 => О компании
 *                               URL parameters are sometimes more conveniently (and more compactly)
 *                               encoded as '0x' . bin2hex($string) instead of rawurlencode()
 * @param   string  $charset
 * @return  bool    TRUE if all keys/values of the arrays are (now) in UTF-8,
 *                  FALSE + E_USER_WARNING otherwise
 */
public static function autoconvert_request($is_hex2bin = false, $charset = 'cp1251')
{
    if (! ReflectionTypeHint::isValid()) return false;
    $was_converted = false;
    $has_broken = false;
    $superglobals = array('_GET', '_POST', '_COOKIE', '_FILES');
    foreach ($superglobals as $name)
    {
        if (array_key_exists($name, $GLOBALS))
        {
            #array_walk_recursive() cannot be used here,
            #because its callback does not receive the key by reference
            $GLOBALS[$name] = self::_autoconvert_request_recursive($GLOBALS[$name], $was_converted, $has_broken, $is_hex2bin, $charset);
            if ($has_broken)
            {
                trigger_error('Array $' . $name . ' does not have keys/values in UTF-8 charset!', E_USER_WARNING);
                return false;
            }
        }
    }
    if (! $was_converted) return true;
    #array union keeps the first occurrence of a key: cookies win over POST, POST wins over GET
    $cookie = isset($_COOKIE) ? $_COOKIE : array();
    $post   = isset($_POST)   ? $_POST   : array();
    $get    = isset($_GET)    ? $_GET    : array();
    $_REQUEST = $cookie + $post + $get;
    return true;
}
/**
 * Recursive worker for self::autoconvert_request().
 *
 * Walks $data and passes every array key and every non-array value through
 * self::_autoconvert_request(). Arrays are rebuilt into a new array, because keys may change.
 *
 * NOTE: values are also modified in place, since $data and the loop variable are references
 * and self::_autoconvert_request() takes its first argument by reference.
 *
 * @param   mixed   &$data          Array or value to process
 * @param   bool    &$is_converted  Set to TRUE by the worker when something was converted
 * @param   bool    &$is_broken     Set to TRUE by the worker when a value could not be made UTF-8;
 *                                  once set, processing stops and $data is returned
 * @param   bool    $is_hex2bin     Passed through to self::_autoconvert_request()
 * @param   string  $charset        Passed through to self::_autoconvert_request()
 * @return  mixed   The rebuilt array / converted value, or $data when broken data was found
 */
private static function _autoconvert_request_recursive(&$data, &$is_converted, &$is_broken, $is_hex2bin, $charset)
{
if ($is_broken) return $data; #speed improve: a previous element was already broken
if (is_array($data))
{
$d = array();
foreach ($data as $k => &$v)
{
#the key is converted first, the converted key indexes the rebuilt array
$k = self::_autoconvert_request($k, $is_converted, $is_broken, $is_hex2bin, $charset);
if ($is_broken) return $data; #speed improve
$d[$k] = self::_autoconvert_request_recursive($v, $is_converted, $is_broken, $is_hex2bin, $charset);
if ($is_broken) return $data; #speed improve
}
return $d;
}
return self::_autoconvert_request($data, $is_converted, $is_broken, $is_hex2bin, $charset);
}
/**
 * Converts a single request key or value to UTF-8 for self::autoconvert_request().
 *
 * When $is_hex2bin is on, a string of the form "0x" + an even number of hex digits is decoded
 * to binary first. A value that is not UTF-8 (see self::is_utf8()) is then converted
 * from $charset with self::convert_from() and checked again.
 *
 * @param   mixed   &$s             Key or value; modified in place
 * @param   bool    &$is_converted  Set to TRUE when the value was hex-decoded or recoded
 * @param   bool    &$is_broken     Set to TRUE when the value is not UTF-8 even after conversion
 * @param   bool    $is_hex2bin     Decode "0x..." HEX strings?
 * @param   string  $charset        Source charset for values that are not UTF-8
 * @return  mixed   The (possibly converted) value; FALSE if self::convert_from() returned FALSE
 */
private static function _autoconvert_request(&$s, &$is_converted, &$is_broken, $is_hex2bin, $charset)
{
    #the cheap strpos() prefix test runs first, the regexp only sees strings that start with "0x";
    #integer array keys can never match, so they are skipped by the is_string() test
    if ($is_hex2bin && is_string($s) && strpos($s, '0x') === 0 && preg_match('/^0x((?:[\da-fA-F]{2})+)$/sSX', $s, $m))
    {
        $s = pack('H' . strlen($m[1]), $m[1]); #hex2bin()
        $is_converted = true;
    }
    if (! self::is_utf8($s))
    {
        $s = self::convert_from($s, $charset);
        if ($s === false) $is_broken = true;
        elseif (! self::is_utf8($s))
        {
            trigger_error('String 0x ' . substr(bin2hex($s), 0, 100) . '... is not UTF-8!', E_USER_WARNING);
            $is_broken = true;
        }
        else $is_converted = true;
    }
    return $s;
}
/**
 * String comparison.
 *
 * Uses the intl Collator for the given locale when the intl extension is available,
 * otherwise falls back to the binary strcmp().
 *
 * @param   string|null  $s1
 * @param   string|null  $s2
 * @param   string       $locale  For example, 'en_CA', 'ru_RU'.
 *                                If empty string ("") or "root" are passed, UCA rules will be used.
 * @return  int|bool|null  NULL if one of the strings is NULL;
 *                         FALSE if error occurred;
 *                         Returns < 0 if $s1 is less than $s2;
 *                                 > 0 if $s1 is greater than $s2;
 *                                 0 if they are equal.
 */
public static function strcmp($s1, $s2, $locale = '')
{
    if (! ReflectionTypeHint::isValid()) return false;
    if (is_null($s1) || is_null($s2)) return null;
    if (! function_exists('collator_create')) return strcmp($s1, $s2);
    # PHP 5 >= 5.3.0, PECL intl >= 1.0.0
    # collator_create() returns NULL on failure, whereas "new Collator()" never yields a falsy value,
    # so only the procedural form makes the error branch below reachable.
    $c = collator_create($locale);
    if (! $c)
    {
        # intl_get_error_code() and/or intl_get_error_message() tell what happened
        trigger_error(intl_get_error_message(), E_USER_WARNING);
        return false;
    }
    return $c->compare($s1, $s2);
}
/**
 * String comparison of the first N characters.
 *
 * Both strings are cut with self::substr() and compared with self::strcmp()
 * using its default locale.
 *
 * @param   string|null  $s1
 * @param   string|null  $s2
 * @param   int          $length  Number of characters to compare
 * @return  int|bool|null  NULL if one of the strings is NULL;
 *                         FALSE if error occurred;
 *                         Returns < 0 if $s1 is less than $s2;
 *                                 > 0 if $s1 is greater than $s2;
 *                                 0 if they are equal.
 */
public static function strncmp($s1, $s2, $length)
{
    if (! ReflectionTypeHint::isValid()) return false;
    if ($s1 === null || $s2 === null) return null;
    $head1 = self::substr($s1, 0, $length);
    $head2 = self::substr($s2, 0, $length);
    return self::strcmp($head1, $head2);
}
/**
 * Implementation of strcasecmp() for UTF-8 encoded strings.
 *
 * Both strings are lower-cased with self::lowercase() and compared with self::strcmp().
 *
 * @param   string|null  $s1
 * @param   string|null  $s2
 * @return  int|bool|null  NULL if one of the strings is NULL;
 *                         FALSE if error occurred;
 *                         Returns < 0 if $s1 is less than $s2;
 *                                 > 0 if $s1 is greater than $s2;
 *                                 0 if they are equal.
 */
public static function strcasecmp($s1, $s2)
{
    if (! ReflectionTypeHint::isValid()) return false;
    if ($s1 === null || $s2 === null) return null;
    $lower1 = self::lowercase($s1);
    $lower2 = self::lowercase($s2);
    return self::strcmp($lower1, $lower2);
}
/**
 * Converts a UTF-8 string to a list of UNICODE codepoints.
 *
 * The string is converted to UCS-4BE by iconv() or mb_convert_encoding() and unpacked
 * as 32-bit big-endian integers; without both extensions the string is split into
 * characters and each one is passed to self::ord().
 *
 * @param   string|null  $s  UTF-8 string
 * @return  array|bool|null  Unicode codepoints, NULL for NULL input,
 *                           FALSE if the conversion failed ($s is not UTF-8)
 */
public static function to_unicode($s)
{
    if (! ReflectionTypeHint::isValid()) return false;
    if ($s === null) return null;
    #NULL means "no converter produced a result", so the pure PHP fallback is used
    $ucs4 = null;
    #since PHP-5.3.x iconv() is a little faster than mb_convert_encoding()
    if (function_exists('iconv'))
    {
        $ucs4 = @iconv('UTF-8', 'UCS-4BE', $s);
    }
    elseif (function_exists('mb_convert_encoding'))
    {
        $ucs4 = @mb_convert_encoding($s, 'UCS-4BE', 'UTF-8');
    }
    if (! is_string($ucs4))
    {
        if ($ucs4 !== null) return false; #the converter reported a failure
        $chars = self::str_split($s);
        if ($chars === false) return false;
        return array_map(array(__CLASS__, 'ord'), $chars);
    }
    #unpack() returns a 1-based array, array_values() makes it 0-based
    return array_values(unpack('N*', $ucs4));
}
/**
 * Converts a list of UNICODE codepoints to a UTF-8 string.
 *
 * The codepoints are packed as 32-bit big-endian integers (UCS-4BE) and converted by
 * iconv() or mb_convert_encoding(); without both extensions every codepoint is
 * passed to self::chr().
 *
 * @param   array|null  $a  Unicode codepoints
 * @return  string|bool|null  UTF-8 string, NULL for NULL input,
 *                            FALSE if error occurred
 */
public static function from_unicode($a)
{
    if (! ReflectionTypeHint::isValid()) return false;
    if ($a === null) return null;
    #since PHP-5.3.x iconv() is a little faster than mb_convert_encoding()
    $has_iconv = function_exists('iconv');
    if (! $has_iconv && ! function_exists('mb_convert_encoding'))
    {
        return implode('', array_map(array(__CLASS__, 'chr'), $a));
    }
    $ucs4 = implode('', array_map(function ($cp) { return pack('N', $cp); }, $a));
    if ($has_iconv) $s = @iconv('UCS-4BE', 'UTF-8', $ucs4);
    else $s = mb_convert_encoding($ucs4, 'UTF-8', 'UCS-4BE');
    return is_string($s) ? $s : false;
}
/**
 * Converts a UTF-8 character to a UNICODE codepoint.
 *
 * The codepoint is assembled from the payload bits of the 1 to 4 bytes of the character.
 * Only the byte length is checked; the byte values themselves are not validated.
 * Results are memoized in a static cache.
 *
 * @param   string|null  $char  UTF-8 character
 * @return  int|bool|null  Unicode codepoint, NULL for NULL input,
 *                         FALSE + E_USER_WARNING if $char is not 1 to 4 bytes long
 */
public static function ord($char)
{
    if (! ReflectionTypeHint::isValid()) return false;
    if (is_null($char)) return $char;
    static $cache = array();
    if (array_key_exists($char, $cache)) return $cache[$char]; #speed improve
    #square brackets are used for byte access: the $char{n} syntax was removed in PHP 8.0
    switch (strlen($char))
    {
        case 1 : return $cache[$char] = ord($char);
        case 2 : return $cache[$char] = (ord($char[1]) & 63) |
                                        ((ord($char[0]) & 31) << 6);
        case 3 : return $cache[$char] = (ord($char[2]) & 63) |
                                        ((ord($char[1]) & 63) << 6) |
                                        ((ord($char[0]) & 15) << 12);
        case 4 : return $cache[$char] = (ord($char[3]) & 63) |
                                        ((ord($char[2]) & 63) << 6) |
                                        ((ord($char[1]) & 63) << 12) |
                                        ((ord($char[0]) & 7)  << 18);
        default :
            trigger_error('Character 0x' . bin2hex($char) . ' is not UTF-8!', E_USER_WARNING);
            return false;
    }
}
/**
 * Converts a UNICODE codepoint to a UTF-8 character.
 *
 * Codepoints outside of 0..0x10FFFF produce U+FFFD REPLACEMENT CHARACTER.
 * Results are memoized in a static cache.
 *
 * @param   int|digit|null  $cp  Unicode codepoint
 * @return  string|bool|null  UTF-8 character, NULL for NULL input,
 *                            FALSE if ReflectionTypeHint::isValid() fails
 */
public static function chr($cp)
{
    if (! ReflectionTypeHint::isValid()) return false;
    if (is_null($cp)) return $cp;
    static $cache = array();
    if (array_key_exists($cp, $cache)) return $cache[$cp]; #speed improve
    #a negative value is not a codepoint; native chr() would wrap it to an arbitrary single byte
    if ($cp < 0)        return $cache[$cp] = "\xEF\xBF\xBD";
    if ($cp <= 0x7f)    return $cache[$cp] = chr($cp);
    if ($cp <= 0x7ff)   return $cache[$cp] = chr(0xc0 | ($cp >> 6))  .
                                             chr(0x80 | ($cp & 0x3f));
    if ($cp <= 0xffff)  return $cache[$cp] = chr(0xe0 | ($cp >> 12)) .
                                             chr(0x80 | (($cp >> 6) & 0x3f)) .
                                             chr(0x80 | ($cp & 0x3f));
    if ($cp <= 0x10ffff) return $cache[$cp] = chr(0xf0 | ($cp >> 18)) .
                                              chr(0x80 | (($cp >> 12) & 0x3f)) .
                                              chr(0x80 | (($cp >> 6) & 0x3f)) .
                                              chr(0x80 | ($cp & 0x3f));
    #U+FFFD REPLACEMENT CHARACTER (the three UTF-8 bytes EF BF BD)
    return $cache[$cp] = "\xEF\xBF\xBD";
}
/**
 * Implementation of chunk_split() for UTF-8 encoded strings.
 *
 * The string is split into chunks of $length characters which are joined with $glue.
 * Note that, as implemented here, $glue is placed between the chunks only
 * and is not appended after the last one.
 *
 * @param   string|null     $s
 * @param   int|digit|null  $length  Chunk length in characters; values < 1 (and NULL) mean 76
 * @param   string|null     $glue    Chunk separator; an empty string (and NULL) means "\r\n"
 * @return  string|bool|null  NULL for NULL input, FALSE if error occurred
 */
public static function chunk_split($s, $length = null, $glue = null)
{
    if (! ReflectionTypeHint::isValid()) return false;
    if (is_null($s)) return $s;
    $length = intval($length);
    $glue = strval($glue);
    if ($length < 1) $length = 76;
    if ($glue === '') $glue = "\r\n"; #CRLF, the same default as the native chunk_split()
    if (! is_array($a = self::str_split($s, $length))) return false;
    return implode($glue, $a);
}
/**
 * Changes the case of all string keys in an array (UTF-8 aware).
 *
 * Non-string keys are copied as is. Only the top level of the array is processed.
 *
 * @param   array|null  $a
 * @param   int         $mode  {CASE_LOWER|CASE_UPPER}
 * @return  array|bool|null  A new array; a non-array $a is returned unchanged;
 *                           FALSE if error occurred
 */
public static function array_change_key_case($a, $mode)
{
    if (! ReflectionTypeHint::isValid()) return false;
    if (! is_array($a)) return $a;
    $result = array();
    foreach ($a as $key => $value)
    {
        if (! is_string($key))
        {
            $result[$key] = $value;
            continue;
        }
        $converted = self::convert_case($key, $mode);
        if ($converted === false) return false;
        $result[$converted] = $value;
    }
    return $result;
}
/**
 * Converts the letter case of data in UTF-8 encoding.
 * Arrays are traversed recursively; only the values of the array elements are converted,
 * the keys stay unchanged.
 * To convert keys only, use self::array_change_key_case().
 *
 * @see     self::array_change_key_case()
 * @link    http://www.unicode.org/charts/PDF/U0400.pdf
 * @link    http://ru.wikipedia.org/wiki/ISO_639-1
 * @param   array|scalar|null  $data   Data of arbitrary structure
 * @param   int                $mode   {CASE_LOWER|CASE_UPPER}
 * @param   bool               $is_ascii_optimization  Use the native strtolower() / strtoupper()
 *                                                     for pure ASCII strings (speed improve)
 * @return  array|scalar|bool|null  Converted data; non-string and empty values are returned unchanged;
 *                                  for an unknown $mode the data is returned unchanged + E_USER_WARNING;
 *                                  FALSE if ReflectionTypeHint::isValid() fails
 */
public static function convert_case($data, $mode, $is_ascii_optimization = true)
{
    if (! ReflectionTypeHint::isValid()) return false;
    if (is_array($data))
    {
        #the optimization flag must be handed down, otherwise nested values ignore the caller's choice
        foreach ($data as $k => &$v) $v = self::convert_case($v, $mode, $is_ascii_optimization);
        unset($v); #break the reference
        return $data;
    }
    if (! is_string($data) || ! $data) return $data;
    if ($mode === CASE_UPPER)
    {
        if ($is_ascii_optimization && self::is_ascii($data)) return strtoupper($data); #speed improve!
        #since PHP-5.3.x strtr() is 2-3 times faster than mb_strtoupper();
        #the flipped (lower => upper becomes upper => lower reversed) table is built only once
        static $upper_table = null;
        if ($upper_table === null) $upper_table = array_flip(self::$convert_case_table);
        return strtr($data, $upper_table);
    }
    if ($mode === CASE_LOWER)
    {
        if ($is_ascii_optimization && self::is_ascii($data)) return strtolower($data); #speed improve!
        #since PHP-5.3.x strtr() is 2-3 times faster than mb_strtolower()
        return strtr($data, self::$convert_case_table);
    }
    trigger_error('Parameter 2 should be a constant of CASE_LOWER or CASE_UPPER!', E_USER_WARNING);
    return $data;
}
/**
 * Converts data to lower case (shortcut for self::convert_case() with CASE_LOWER).
 *
 * @param   array|scalar|null  $data
 * @return  array|scalar|bool|null  Returns FALSE if error occurred
 */
public static function lowercase($data)
{
    if (ReflectionTypeHint::isValid())
    {
        return self::convert_case($data, CASE_LOWER);
    }
    return false;
}
/**
 * Converts data to upper case (shortcut for self::convert_case() with CASE_UPPER).
 *
 * @param   array|scalar|null  $data
 * @return  array|scalar|bool|null  Returns FALSE if error occurred
 */
public static function uppercase($data)
{
    if (ReflectionTypeHint::isValid())
    {
        return self::convert_case($data, CASE_UPPER);
    }
    return false;
}
/**
 * Implementation of strtolower() for UTF-8 data; does the same as self::lowercase().
 *
 * @see     self::lowercase()
 * @param   array|scalar|null  $data
 * @return  array|scalar|bool|null  Returns FALSE if error occurred
 */
public static function strtolower($data)
{
    $is_valid = ReflectionTypeHint::isValid();
    if (! $is_valid) return false;
    $lower = self::convert_case($data, CASE_LOWER);
    return $lower;
}
/**
 * Implementation of strtoupper() for UTF-8 data; does the same as self::uppercase().
 *
 * @see     self::uppercase()
 * @param   array|scalar|null  $data
 * @return  array|scalar|bool|null  Returns FALSE if error occurred
 */
public static function strtoupper($data)
{
    $is_valid = ReflectionTypeHint::isValid();
    if (! $is_valid) return false;
    $upper = self::convert_case($data, CASE_UPPER);
    return $upper;
}
/**
 * Converts all HTML entities to native UTF-8 characters.
 * The function decodes the named entities listed in self::$html_entity_table
 * (many more than the standard html_entity_decode() knows).
 * All dec and hex entities are converted to UTF-8 as well.
 *
 * Example: '&quot;' or '&#34;' or '&#x22;' will be converted to '"' (with $is_special_chars = TRUE).
 *
 * @link    http://www.htmlhelp.com/reference/html40/entities/
 * @link    http://www.alanwood.net/demos/ent4_frame.html (HTML 4.01 Character Entity References)
 * @link    http://msdn.microsoft.com/workshop/author/dhtml/reference/charsets/charset1.asp?frame=true
 * @link    http://msdn.microsoft.com/workshop/author/dhtml/reference/charsets/charset2.asp?frame=true
 * @link    http://msdn.microsoft.com/workshop/author/dhtml/reference/charsets/charset3.asp?frame=true
 *
 * @param   scalar|null  $s
 * @param   bool         $is_special_chars  Also decode the special html entities (&lt; &gt; &amp; &quot;)?
 *                                          When FALSE, numeric entities of those characters are
 *                                          rewritten to their named form instead of being decoded.
 * @return  scalar|bool|null  Non-string input is returned unchanged;
 *                            FALSE if ReflectionTypeHint::isValid() fails
 */
public static function html_entity_decode($s, $is_special_chars = false)
{
    if (! ReflectionTypeHint::isValid()) return false;
    if (! is_string($s)) return $s;
    #speed improve: the shortest entity takes 4 bytes (&#d; &xx;) and needs "&" followed by ";"
    if (strlen($s) < 4) return $s;
    $pos = strpos($s, '&');
    if ($pos === false || strpos($s, ';', $pos) === false) return $s;
    $table = self::$html_entity_table;
    if ($is_special_chars) $table += self::$html_special_chars_table;
    #replace named entities (since PHP-5.3.x strtr() is 1.5 times faster than preg_match_all() + str_replace())
    $s = strtr($s, $table);
    #replace numeric dec and hex entities
    if (strpos($s, '&#') !== false) #speed improve
    {
        $class = __CLASS__; #closures of PHP 5.3 cannot use self::
        $html_special_chars_table_flipped = array_flip(self::$html_special_chars_table);
        $s = preg_replace_callback('/&#(x[\da-fA-F]{1,6}+|\d{1,7}+);/sSX',
                                    function (array $m) use ($class, $html_special_chars_table_flipped, $is_special_chars)
                                    {
                                        #the leading "x" is cut off, because hexdec() must see hex digits only
                                        $codepoint = ($m[1][0] === 'x') ? hexdec(substr($m[1], 1)) : intval($m[1], 10);
                                        #only a single-byte codepoint can be one of the special characters;
                                        #larger values must not be reduced modulo 256
                                        if (! $is_special_chars && $codepoint < 0x80)
                                        {
                                            $char = chr($codepoint);
                                            if (array_key_exists($char, $html_special_chars_table_flipped)) return $html_special_chars_table_flipped[$char];
                                        }
                                        return $class::chr($codepoint);
                                    }, $s);
    }
    return $s;
}
/**
 * Converts special UTF-8 characters to HTML entities.
 * The function encodes the characters listed in self::$html_entity_table
 * (many more named entities than the standard htmlentities() knows).
 *
 * @link    http://www.htmlhelp.com/reference/html40/entities/
 * @link    http://www.alanwood.net/demos/ent4_frame.html (HTML 4.01 Character Entity References)
 * @link    http://msdn.microsoft.com/workshop/author/dhtml/reference/charsets/charset1.asp?frame=true
 * @link    http://msdn.microsoft.com/workshop/author/dhtml/reference/charsets/charset2.asp?frame=true
 * @link    http://msdn.microsoft.com/workshop/author/dhtml/reference/charsets/charset3.asp?frame=true
 *
 * @param   scalar|null  $s
 * @param   bool         $is_special_chars_only  Encode the special html characters only
 *                                               (delegates to htmlspecialchars() with its defaults)?
 * @return  scalar|bool|null  Non-string input is returned unchanged;
 *                            FALSE if ReflectionTypeHint::isValid() fails
 */
public static function html_entity_encode($s, $is_special_chars_only = false)
{
    if (! ReflectionTypeHint::isValid()) return false;
    if (! is_string($s)) return $s;
    if ($is_special_chars_only) return htmlspecialchars($s);
    #replace UTF-8 chars with named entities
    #(since PHP-5.3.x strtr() is 3 times faster than preg_match_all() + str_replace())
    return strtr($s, array_flip(self::$html_entity_table));
}
/**
 * Makes a regular expression for a case insensitive match.
 * Example (non ASCII): "123_слово_test" => "123_(с|С)(л|Л)(о|О)(в|В)(о|О)_[tT][eE][sS][tT]"
 * Example (only ASCII): "123_test" => "(?i:123_test)"
 *
 * @param   string       $s
 * @param   string|null  $delimiter  If the optional delimiter is specified, it will also be escaped.
 *                                   This is useful for escaping the delimiter that is required by the PCRE functions.
 *                                   The / is the most commonly used delimiter.
 * @return  string|bool|null  NULL for NULL input, FALSE if error occurred
 */
public static function preg_quote_case_insensitive($s, $delimiter = null)
{
    if (! ReflectionTypeHint::isValid()) return false;
    if (is_null($s)) return $s;
    if (self::is_ascii($s)) return '(?i:' . preg_quote($s, $delimiter) . ')'; #speed improve
    #self:: (not the hard-coded class name) is used like everywhere else in the class
    $s_lc = self::lowercase($s);        if ($s_lc === false) return false;
    $s_uc = self::uppercase($s);        if ($s_uc === false) return false;
    $chars_lc = self::str_split($s_lc); if ($chars_lc === false) return false;
    $chars_uc = self::str_split($s_uc); if ($chars_uc === false) return false;
    #both variants are walked in parallel, so they must have the same number of characters
    if (count($chars_lc) !== count($chars_uc)) return false;
    $s_re = '';
    foreach ($chars_lc as $i => $char_lc)
    {
        $char_uc = $chars_uc[$i];
        if ($char_lc === $char_uc)
            $s_re .= preg_quote($char_lc, $delimiter); #the character has no case
        elseif (self::is_ascii($char_lc))
            $s_re .= '[' . preg_quote($char_lc . $char_uc, $delimiter) . ']';
        else
            #multibyte characters need an alternation to work without the /u modifier as well
            $s_re .= '(' . preg_quote($char_lc, $delimiter) . '|'
                         . preg_quote($char_uc, $delimiter) . ')';
    }
    return $s_re;
}
/**
 * Calls preg_match_all() and converts byte offsets into character offsets for the PREG_OFFSET_CAPTURE flag.
 * This is regardless of whether you use the /u modifier.
 *
 * @link    http://bolknote.ru/2010/09/08/~2704
 *
 * @param   string       $pattern
 * @param   string|null  $subject
 * @param   array        &$matches     Filled by preg_match_all()
 * @param   int          $flags
 * @param   int          $char_offset  Start of the search, in characters
 * @return  int|bool|null  The result of preg_match_all() (number of full pattern matches),
 *                         NULL for a NULL subject, FALSE if error occurred
 */
public static function preg_match_all($pattern, $subject, &$matches, $flags = PREG_PATTERN_ORDER, $char_offset = 0)
{
    if (! ReflectionTypeHint::isValid()) return false;
    if (is_null($subject)) return null;
    #character offset => byte offset
    $byte_offset = ($char_offset > 0) ? strlen(self::substr($subject, 0, $char_offset)) : $char_offset;
    $return = preg_match_all($pattern, $subject, $matches, $flags, $byte_offset);
    if ($return === false) return false;
    if ($flags & PREG_OFFSET_CAPTURE)
    {
        foreach ($matches as &$match)
        {
            foreach ($match as &$a)
            {
                #byte offset => character offset;
                #0 needs no conversion and -1 (subpattern did not participate in the match) must stay as is
                if ($a[1] > 0) $a[1] = self::strlen(substr($subject, 0, $a[1]));
            }
            unset($a); #break the reference
        }
        unset($match); #break the reference
    }
    return $return;
}
/**
 * Alias for self::str_limit(); all arguments are passed through unchanged.
 *
 * @see     self::str_limit()
 * @param   string|null     $s
 * @param   int|null|digit  $maxlength
 * @param   string          $continue         Default is "…" (U+2026 HORIZONTAL ELLIPSIS)
 * @param   bool|null       &$is_cutted
 * @param   int|digit       $tail_min_length
 * @return  string|bool|null
 */
public static function truncate($s, $maxlength = null, $continue = "\xe2\x80\xa6", &$is_cutted = null, $tail_min_length = 20) #"\xe2\x80\xa6" = "…"
{
    return self::str_limit($s, $maxlength, $continue, $is_cutted, $tail_min_length);
}
/**
 * Cuts a text in UTF-8 encoding down to the given length;
 * the last word is shown completely instead of being broken in the middle.
 * HTML entities and CRLF line breaks are counted as one character each and are never split.
 *
 * @param   string|null     $s                Text in UTF-8 encoding
 * @param   int|null|digit  $maxlength        Length limit of the text; an empty value means 256
 * @param   string          $continue         Closing string that is appended to the text if it gets cut;
 *                                            default (and NULL) is "…" (U+2026)
 * @param   bool|null       &$is_cutted       Was the text cut?
 * @param   int|digit       $tail_min_length  If the length of the "tail" left over after cutting is less
 *                                            than $tail_min_length, the text is returned unchanged
 * @return  string|bool|null  NULL for NULL input, FALSE if error occurred
 */
public static function str_limit($s, $maxlength = null, $continue = "\xe2\x80\xa6", &$is_cutted = null, $tail_min_length = 20) #"\xe2\x80\xa6" = "…"
{
    if (! ReflectionTypeHint::isValid()) return false;
    if (is_null($s)) return $s;
    $is_cutted = false;
    if ($continue === null) $continue = "\xe2\x80\xa6";
    if (! $maxlength) $maxlength = 256;

    #an html entity: named, decimal or hex (&lt; &gt; &amp; &quot; &#34; &#x22;);
    #"#" is escaped, because both patterns below use the x modifier, where a bare "#" starts a comment
    $entity_re = '&(?> [a-zA-Z][a-zA-Z\d]+
                     | \#(?> \d{1,4}
                           | x[\da-fA-F]{2,4}
                         )
                   );';

    #speed improve block: cheap checks that prove the text is short enough
    #{{{
    if (strlen($s) <= $maxlength) return $s;
    $s2 = str_replace("\r\n", '?', $s);
    $s2 = preg_replace('/' . $entity_re . '/sxSX', '?', $s2);
    if (strlen($s2) <= $maxlength || self::strlen($s2) <= $maxlength) return $s;
    #}}}

    #split the text into units: a CRLF line break, an html entity or a single character
    $r = preg_match_all('/(?> \r\n
                            | ' . $entity_re . '
                            | .
                          )
                        /sxuSX', $s, $m);
    if ($r === false) return false;

    if (count($m[0]) <= $maxlength) return $s;

    $left = implode('', array_slice($m[0], 0, $maxlength));
    #strip trailing ASCII control chars, spaces, punctuation and opening brackets;
    #letters, digits, closing brackets ")" "]" "}" and "?" are kept;
    #";" must not be cut off at the end either, because it terminates entities like &xxx;
    $left2 = rtrim($left, "\x00..\x28\x2A..\x2F\x3A\x3C..\x3E\x40\x5B\x5C\x5E..\x60\x7B\x7C\x7E\x7F");
    if (strlen($left) !== strlen($left2)) $return = $left2 . $continue;
    else
    {
        #the cut went through a word: append the rest of that word
        $right = implode('', array_slice($m[0], $maxlength));
        #digits, closing brackets, hyphen for compound words, dates, times, IP addresses, URLs like www.ya.ru:80;
        #letters; quotation marks. The quotes are written as \x{...} code points, because with the
        #u modifier \xHH denotes the code point U+00HH and not a byte of the UTF-8 sequence.
        preg_match('/^(?> [\d\)\]\}\-\.:]+
                        | \p{L}+
                        | \x{201D}
                        | \x{2019}
                        | \x{201C}
                        | \x{00BB}
                      )+
                    /suxSX', $right, $m);
        $right = isset($m[0]) ? rtrim($m[0], '.-') : '';
        $return = $left . $right;
        if (strlen($return) !== strlen($s)) $return .= $continue;
    }
    if (self::strlen($s) - self::strlen($return) < $tail_min_length) return $s;

    $is_cutted = true;
    return $return;
}
/**
 * Implementation of str_split() for UTF-8 encoded strings.
 *
 * @param   string|null     $s
 * @param   int|null|digit  $length  Chunk length in characters, 1 by default
 * @return  array|bool|null  List of chunks, NULL for NULL input,
 *                           FALSE if $length < 1 or if the string could not be split (PCRE error)
 */
public static function str_split($s, $length = null)
{
    if (! ReflectionTypeHint::isValid()) return false;
    if ($s === null) return null;
    if ($length === null) $length = 1;
    else $length = intval($length);
    if ($length < 1) return false;
    #every character is matched separately, because regexps have limits for the {min,max} quantifier
    $matched = preg_match_all('~.~suSX', $s, $m);
    if ($matched === false) return false;
    if (function_exists('preg_last_error') && preg_last_error() !== PREG_NO_ERROR) return false;
    $chars = $m[0];
    if ($length === 1) return $chars;
    #glue every $length characters together
    return array_map(
        function (array $chunk) { return implode('', $chunk); },
        array_chunk($chars, $length)
    );
}
/**
 * Implementation of strlen() for UTF-8 encoded strings.
 *
 * Uses mb_strlen() when it is available (since PHP-5.3.x it is faster than
 * strlen(utf8_decode())). The fallback relies on utf8_decode(), which converts characters
 * that are not in ISO-8859-1 to '?' - for the purpose of counting that is quite alright.
 *
 * Alternatives that were measured as slower than the fallback: iconv_strlen(),
 * counting the matches of preg_match_all('~.~suSX'), and a manual loop over the lead bytes.
 *
 * @param   string|null  $s
 * @return  int|bool|null  Number of characters, NULL for NULL input,
 *                         FALSE if ReflectionTypeHint::isValid() fails
 */
public static function strlen($s)
{
    if (! ReflectionTypeHint::isValid()) return false;
    if ($s === null) return null;
    if (! function_exists('mb_strlen'))
    {
        #one output byte per input character
        return strlen(utf8_decode($s));
    }
    return mb_strlen($s, 'utf-8');
}
/**
 * Implementation of strpos() for UTF-8 encoded strings.
 *
 * @param   string|null  $s       The entire string
 * @param   string|int   $needle  The searched substring
 * @param   int|null     $offset  Position (in characters) from which the search should be performed;
 *                                NULL and negative values mean 0
 * @return  int|bool|null  The numeric position (in characters) of the first occurrence of needle in haystack.
 *                         If needle is not found, will return FALSE. NULL for a NULL haystack.
 */
public static function strpos($s, $needle, $offset = null)
{
    if (! ReflectionTypeHint::isValid()) return false;
    if ($s === null) return null;
    if ($offset === null || $offset < 0) $offset = 0;
    if (function_exists('mb_strpos')) return mb_strpos($s, $needle, $offset, 'utf-8');
    #iconv_strpos() is not used, because it is slower than self::strlen(substr())
    #the byte-wise search starts at byte number $offset
    #and is repeated until the hit is at or behind character number $offset
    $byte_pos = $offset;
    while (true)
    {
        $byte_pos = strpos($s, $needle, $byte_pos);
        if ($byte_pos === false) return false;
        #byte position => character position
        $char_pos = self::strlen(substr($s, 0, $byte_pos));
        $byte_pos++; #the next search starts behind the current hit
        if ($char_pos >= $offset) return $char_pos;
    }
}
/**
 * Finds the position of the first occurrence of a case-insensitive string.
 *
 * @param   string|null  $s       The entire string
 * @param   string|int   $needle  The searched substring
 * @param   int|null     $offset  Position (in characters) from which the search should be performed;
 *                                NULL and negative values mean 0
 * @return  int|bool|null  The numeric position (in characters) of the first occurrence of needle in haystack.
 *                         If needle is not found, will return FALSE. NULL for a NULL haystack.
 */
public static function stripos($s, $needle, $offset = null)
{
    if (! ReflectionTypeHint::isValid()) return false;
    if (is_null($s)) return $s;
    if ($offset === null || $offset < 0) $offset = 0;
    if (function_exists('mb_stripos')) return mb_stripos($s, $needle, $offset, 'utf-8');
    #optimization block (speed improve)
    #{{{
    if (self::is_ascii($s))
    {
        #pure ASCII haystack: byte offsets equal character offsets, the native function is safe
        if (self::is_ascii($needle)) return stripos($s, $needle, $offset);
        #a needle with non-ASCII characters cannot occur in a pure ASCII haystack
        return false;
    }
    #an ASCII needle CAN occur in a non-ASCII haystack, so that case takes the general path below
    #}}}
    $s = self::convert_case($s, CASE_LOWER, false);
    if ($s === false) return false;
    $needle = self::convert_case($needle, CASE_LOWER, false);
    if ($needle === false) return false;
    return self::strpos($s, $needle, $offset);
}
/**
* Implementation strrev() function for UTF-8 encoding string
*
* @param string|null $s
* @return string|bool|null Returns FALSE if error occurred
*/
public static function strrev($s)
{
    # Bail out when ReflectionTypeHint::isValid() rejects the call
    # (defined outside this file chunk; presumably an argument type check).
    if (! ReflectionTypeHint::isValid()) return false;
    # A null string is handed back untouched.
    if ($s === null) return null;

    # TODO test speed: an alternative that converts to UTF-32, reverses the
    # 4-byte units and converts back was present here but permanently disabled.

    # Split into UTF-8 characters, reverse their order and glue them together.
    $chars = self::str_split($s);
    if (! is_array($chars)) return false;
    $reversed = array_reverse($chars);
    return implode('', $reversed);
}
/**
* Implementation substr() function for UTF-8 encoding string.
*
* @link http://www.w3.org/International/questions/qa-forms-utf-8.html
* @param string|null $s
* @param int|digit $offset
* @param int|null|digit $length
* @return string|bool|null Returns FALSE if error occurred
*/
public static function substr($s, $offset, $length = null)
{
    # Bail out when ReflectionTypeHint::isValid() rejects the call
    # (defined outside this file chunk; presumably an argument type check).
    if (! ReflectionTypeHint::isValid()) return false;
    # A null string is handed back untouched.
    if ($s === null) return null;

    # Native implementations first; mb_substr() is preferred over iconv_substr()
    # (since PHP-5.3.x mb_substr() is faster than iconv_substr()).
    $has_mb = function_exists('mb_substr');
    if ($has_mb || function_exists('iconv_substr'))
    {
        # Both natives get an explicit length: the full character count
        # stands in for "up to the end of the string".
        if ($length === null) $length = self::strlen($s);
        if ($has_mb) return mb_substr($s, $offset, $length, 'utf-8');
        return iconv_substr($s, $offset, $length, 'utf-8');
    }

    # Pure PHP fallback. The character array of the last processed string is
    # kept in static variables, so repeated calls on the same string do not
    # split it again.
    static $cached_string = null;
    static $cached_chars  = null;
    if ($cached_string !== $s)
    {
        $cached_string = $s;
        $cached_chars  = self::str_split($s);
    }
    if (! is_array($cached_chars)) return false;

    $slice = ($length !== null)
        ? array_slice($cached_chars, $offset, $length)
        : array_slice($cached_chars, $offset);
    return implode('', $slice);
}
/**
* Implementation substr_replace() function for UTF-8 encoding string.
*
* @param string|null $s
* @param string|int $replacement
* @param int|digit $start
* @param int|null $length
* @return string|bool|null Returns FALSE if error occurred
*/
public static function substr_replace($s, $replacement, $start, $length = null)
{
    # Bail out when ReflectionTypeHint::isValid() rejects the call
    # (defined outside this file chunk; presumably an argument type check).
    if (! ReflectionTypeHint::isValid()) return false;
    # A null string is handed back untouched.
    if (is_null($s)) return $s;

    # Work on an array of UTF-8 characters so that $start and $length are
    # measured in characters, not bytes.
    $chars = self::str_split($s);
    if (! is_array($chars)) return false;

    # Before PHP 8.0 array_splice() casts a NULL length to 0, which would insert
    # the replacement without removing anything. Resolve the default explicitly
    # so that an omitted length means "replace up to the end of the string",
    # like the native substr_replace().
    if ($length === null) $length = count($chars);

    array_splice($chars, $start, $length, $replacement);
    return implode('', $chars);
}
/**
* Implementation ucfirst() function for UTF-8 encoding string.
* Преобразует первый символ строки в кодировке UTF-8 в верхний регистр.
*
* @param string|null $s
* @param bool $is_other_to_lowercase остальные символы преобразуются в нижний регистр?
* @return string|bool|null Returns FALSE if error occurred
*/
public static function ucfirst($s, $is_other_to_lowercase = true)
{
    # Bail out when ReflectionTypeHint::isValid() rejects the call
    # (defined outside this file chunk; presumably an argument type check).
    if (! ReflectionTypeHint::isValid()) return false;
    # null, non-string values and the empty string are returned unchanged.
    if (! is_string($s) || $s === '') return $s;

    # Capture the first UTF-8 character and the rest of the string;
    # the match fails (=> FALSE) when $s is not valid UTF-8.
    $is_matched = preg_match('/^(.)(.*)$/suSX', $s, $parts);
    if (! $is_matched) return false;

    $head = self::uppercase($parts[1]);
    $tail = $parts[2];
    if ($is_other_to_lowercase) $tail = self::lowercase($tail);
    return $head . $tail;
}
/**
* Implementation ucwords() function for UTF-8 encoding string.
* Преобразует в верхний регистр первый символ каждого слова в строке в кодировке UTF-8,
* остальные символы каждого слова преобразуются в нижний регистр.
*
* @param string|null $s
* @param bool $is_other_to_lowercase остальные символы преобразуются в нижний регистр?
* @param string $spaces_re
* @return string|bool|null Returns FALSE if error occurred
*/
public static function ucwords($s, $is_other_to_lowercase = true, $spaces_re = '~([\pZ\s]+)~suSX')
{
    # Default separator class: Unicode property Z (separators) or whitespace
    # (tab, NL, VT, FF, CR). The pattern must keep its capturing group so that
    # the separators are returned by preg_split() and survive the implode().

    # Bail out when ReflectionTypeHint::isValid() rejects the call
    # (defined outside this file chunk; presumably an argument type check).
    if (! ReflectionTypeHint::isValid()) return false;
    # A null string is handed back untouched.
    if (is_null($s)) return $s;

    $words = preg_split($spaces_re, $s, -1, PREG_SPLIT_NO_EMPTY | PREG_SPLIT_DELIM_CAPTURE);
    # preg_split() returns FALSE on failure (e.g. invalid UTF-8 with the /u flag).
    if (! is_array($words)) return false;

    foreach ($words as $k => $word)
    {
        # Pass the caller's flag through unchanged (it used to be overwritten with true).
        $words[$k] = self::ucfirst($word, $is_other_to_lowercase);
        if ($words[$k] === false) return false;
    }
    return implode('', $words);
}
/**
* Decodes %uXXXX and %u{XXXXXX} sequences in a string into UTF-8 characters.
*
* Used to decode data such as "%u0442%u0435%u0441%u0442",
* produced by the obsolete JavaScript function escape().
* Prefer the JavaScript function encodeURIComponent() instead.
*
* NOTICE
* The obsolete %uXXXX format can only express code points from the UCS-2 range, i.e. from U+0000 to U+FFFF.
*
* @param scalar|array|null $data
* @param bool $is_rawurlencode
* @return scalar|array|null Returns FALSE if error occurred
*/
public static function unescape($data, $is_rawurlencode = false)
{
    # Bail out when ReflectionTypeHint::isValid() rejects the call
    # (defined outside this file chunk; presumably an argument type check).
    if (! ReflectionTypeHint::isValid()) return false;

    # Arrays are processed recursively, keys included.
    if (is_array($data))
    {
        $d = array();
        foreach ($data as $k => $v)
        {
            $k = self::unescape($k, $is_rawurlencode);
            if ($k === false) return false;
            $d[$k] = self::unescape($v, $is_rawurlencode);
            # FALSE is an error marker unless the source value itself was a boolean.
            if ($d[$k] === false && ! is_bool($v)) return false;
        }
        return $d;
    }

    if (is_string($data))
    {
        if (strpos($data, '%u') === false) return $data; #use strpos() for speed improving
        return preg_replace_callback('/%u(    [\da-fA-F]{4}+          #%uXXXX only UCS-2
                                            | \{ [\da-fA-F]{1,6}+ \}  #%u{XXXXXX} extended form for all UNICODE charts
                                          )
                                     /sxSX',
            function (array $m) use ($is_rawurlencode)
            {
                # Strip the optional braces, then turn the hex code point into a UTF-8 character.
                $codepoint = hexdec(trim($m[1], '{}'));
                $char = self::chr($codepoint);
                # Optionally re-encode the character as %XX bytes so the result stays URL-safe.
                return $is_rawurlencode ? rawurlencode($char) : $char;
            },
            $data);
    }

    if (is_scalar($data) || is_null($data)) return $data; #~ null, integer, float, boolean
    return false; #object or resource
}
/**
* 1) Corrects the global arrays $_GET, $_POST, $_COOKIE, $_REQUEST
* decoded values in the format %uXXXX and %u{XXXXXX}, encoded,
* for example, through an outdated javascript function escape().
* Standard PHP5 cannot do it.
* 2) If in the HTTP_COOKIE there are parameters with the same name,
* takes the last value, not the first, as in the QUERY_STRING.
* 3) Creates an array of $_POST for non-standard Content-Type, for example, "Content-Type: application/octet-stream".
* Standard PHP5 creates an array for "Content-Type: application/x-www-form-urlencoded" and "Content-Type: multipart/form-data".
*
* Сессии, куки и независимая авторизация на поддоменах.
*
* ПРИМЕР 1
* У рабочего сайта http://domain.com появились поддомены.
* Для кроссдоменной авторизации через механизм сессий имя хоста для COOKIE было изменено с "domain.com" на ".domain.com"
* В результате авторизация не работает.
* Помогает очистка COOKIE, но их принудительная очистка на тысячах пользовательских компьютеров проблематична.
* The problem: when HTTP_COOKIE contains parameters with the same name, PHP takes the first value,
* not the last one as it does for QUERY_STRING.
* Более подробное описание:
* PHP не правильно (?) обрабатывает заголовок HTTP_COOKIE, если там встречаются параметры с одинаковым именем, но разными значениями.
* Пример запроса HTTP-заголовка клиентом: "Cookie: sid=chpgs2fiak-330mzqza; sid=cmz5tnp5zz-xlbbgqp"
* В этом случае сервер берёт первое значение, а не последнее.
* Хотя если в QUERY_STRING есть такая ситуация, всегда берётся последний параметр.
* В HTTP_COOKIE два параметра с одинаковым именем могут появиться, если отправить клиенту следующие HTTP-заголовки:
* "Set-Cookie: sid=chpgs2fiak-330mzqza; expires=Thu, 15 Oct 2009 14:23:42 GMT; path=/; domain=domain.com" (только domain.com)
* "Set-Cookie: sid=cmz6uqorzv-1bn35110; expires=Thu, 15 Oct 2009 14:23:42 GMT; path=/; domain=.domain.com" (domain.com и все его поддомены)
* Решение: поменять имя сессии.
*
* ПРИМЕР 2
* Есть рабочие сайты: http://domain.com (основной), http://admin.domain.com (админка),
* http://sub1.domain.com (подпроект 1), http://sub2.domain.com, (подпроект 2).
* Так же имеется сервер разработки http://dev.domain.com, на котором м. б. свои поддомены.
* Требуется сделать независимую кросс-доменную авторизацию для http://*.domain.com и http://*.dev.domain.com.
* Для сохранения статуса авторизации будем использовать сессию, имя и значение которой пишется в COOKIE.
* Т. к. домены http://*.dev.domain.com имеют пересечение с доменами http://*.domain.com,
* для независимой авторизации нужно использовать разные имена сессий.
* Пример HTTP заголовков ответа сервера:
* "Set-Cookie: sid=chpgs2fiak-330mzqza; expires=Thu, 15 Oct 2009 14:23:42 GMT; path=/; domain=.domain.com" (.domain.com и все его поддомены)
* "Set-Cookie: sid.dev=cmz6uqorzv-1bn35110; expires=Thu, 15 Oct 2009 14:23:42 GMT; path=/; domain=.dev.domain.com" (dev.domain.com и все его поддомены)
*
* @link http://tools.ietf.org/html/rfc2965 RFC 2965 - HTTP State Management Mechanism
* @return void
*/
public static function unescape_request()
{
# Re-parses the raw query string, POST body and Cookie header into the
# $_GET, $_POST and $_COOKIE entries of $GLOBALS, decoding %uXXXX sequences
# on the way, and rebuilds $_REQUEST when anything was re-parsed.
# $fixed records whether at least one of the arrays was (re)built here.
$fixed = false;
#ATTENTION! HTTP_RAW_POST_DATA is only accessible when Content-Type of POST request is NOT default "application/x-www-form-urlencoded"!
# For POST requests the body is taken from $GLOBALS['HTTP_RAW_POST_DATA'] when set,
# otherwise read from php://input; for any other method it is null.
$HTTP_RAW_POST_DATA = isset($_SERVER['REQUEST_METHOD']) && $_SERVER['REQUEST_METHOD'] === 'POST' ? (isset($GLOBALS['HTTP_RAW_POST_DATA']) ? $GLOBALS['HTTP_RAW_POST_DATA'] : @file_get_contents('php://input')) : null;
if (ini_get('always_populate_raw_post_data')) $GLOBALS['HTTP_RAW_POST_DATA'] = $HTTP_RAW_POST_DATA;
# Map each global array name to the raw string it should be parsed from.
foreach (array( '_GET' => isset($_SERVER['QUERY_STRING']) ? $_SERVER['QUERY_STRING'] : null,
'_POST' => $HTTP_RAW_POST_DATA,
'_COOKIE' => isset($_SERVER['HTTP_COOKIE']) ? $_SERVER['HTTP_COOKIE'] : null,
) as $k => $v)
{
# No raw data for this source: nothing to do.
if (! is_string($v)) continue;
if ($k === '_COOKIE')
{
# Turn the ";" cookie separators (plus following spaces) into "&" so that parse_str() can be used.
$v = preg_replace('/; *+/sSX', '&', $v);
unset($_COOKIE); #parse HTTP_COOKIE ourselves, so that it is processed the same way as QUERY_STRING
}
if (strpos($v, '%u') !== false)
{
# %u sequences present: decode them (re-encoded as %XX bytes) before parsing.
parse_str(self::unescape($v, $is_rawurlencode = true), $GLOBALS[$k]);
$fixed = true;
continue;
}
# The array already exists in $GLOBALS and needs no decoding: leave it as it is.
if (array_key_exists($k, $GLOBALS)) continue;
parse_str($v, $GLOBALS[$k]);
$fixed = true;
}
if ($fixed)
{
# Array union keeps the left operand's value on key collisions,
# so cookie values win over POST values, which win over GET values.
$_REQUEST =
(isset($_COOKIE) ? $_COOKIE : array()) +
(isset($_POST) ? $_POST : array()) +
(isset($_GET) ? $_GET : array());
}
}
/**
* Calculates the height of the edit text in <textarea> html tag by value and width.
*
* В большинстве случаев будет корректно работать для моноширинных шрифтов.
* Т.к. браузер переносит последнее слово, которое не умещается на строке,
* на следующую строку, высота м.б. меньше ожидаемой.
* Этот алгоритм явл. простым (и быстрым) и не отслеживает переносы слов.
*
* @param string|null $s Текст
* @param int|digit $cols Ширина области редактирования (колонок)
* @param int|digit $min_rows Минимальное кол-во строк
* @param int|digit $max_rows Максимальное кол-во строк
* @return int|bool|null Number of rows (lines)
*/
public static function textarea_rows($s, $cols, $min_rows = 3, $max_rows = 32)
{
    # Bail out when ReflectionTypeHint::isValid() rejects the call
    # (defined outside this file chunk; presumably an argument type check).
    if (! ReflectionTypeHint::isValid()) return false;
    # A null text is handed back untouched.
    if (is_null($s)) return $s;
    if (strlen($s) == 0) return $min_rows; #speed improve
    # A width below one column cannot hold any text and would divide by zero below.
    if ($cols < 1) return false;

    $rows = 0;
    #utf8_decode() converts characters that are not in ISO-8859-1 to '?',
    #so after decoding one byte corresponds to one character and strlen() counts characters.
    #NOTE(review): utf8_decode() is deprecated as of PHP 8.2 - confirm the supported PHP versions.
    foreach (preg_split('/\r\n|[\r\n]/sSX', utf8_decode($s)) as $line)
    {
        # Every source line needs at least one row; "+ 1" reserves room for the line break.
        $rows += (int) ceil((strlen($line) + 1) / $cols);
        if ($rows > $max_rows) return $max_rows;
    }
    return ($rows < $min_rows) ? $min_rows : $rows;
}
/**
* @param string|null $s
* @param string|null $charlist
* @return string|bool|null
*/
public static function ltrim($s, $charlist = null)
{
    # Strips characters from the beginning of a UTF-8 string.
    # Bail out when ReflectionTypeHint::isValid() rejects the call
    # (defined outside this file chunk; presumably an argument type check).
    if (! ReflectionTypeHint::isValid()) return false;
    # A null string is handed back untouched.
    if (is_null($s)) return $s;
    # No char list: strip the default whitespace set of the native function.
    if ($charlist === null) return ltrim($s);
    # A pure ASCII char list can be handled byte-wise by the native function
    # (the list used to be dropped here, so custom ASCII characters were never stripped).
    if (self::is_ascii($charlist)) return ltrim($s, $charlist);
    # Multibyte char list: strip via a UTF-8 aware character class.
    return preg_replace('~^[' . self::_preg_quote_class($charlist, '~') . ']+~suSX', '', $s);
}
/**
* @param string|null $s
* @param string|null $charlist
* @return string|bool|null
*/
public static function rtrim($s, $charlist = null)
{
    # Strips characters from the end of a UTF-8 string.
    # Bail out when ReflectionTypeHint::isValid() rejects the call
    # (defined outside this file chunk; presumably an argument type check).
    if (! ReflectionTypeHint::isValid()) return false;
    # A null string is handed back untouched.
    if (is_null($s)) return $s;
    # No char list: strip the default whitespace set of the native function.
    if ($charlist === null) return rtrim($s);
    # A pure ASCII char list can be handled byte-wise by the native function
    # (the list used to be dropped here, so custom ASCII characters were never stripped).
    if (self::is_ascii($charlist)) return rtrim($s, $charlist);
    # Multibyte char list: strip via a UTF-8 aware character class.
    # The D modifier makes "$" match only at the very end, not before a trailing newline.
    return preg_replace('~[' . self::_preg_quote_class($charlist, '~') . ']+$~suSXD', '', $s);
}
/**
* @param scalar|null $s
* @param string|null $charlist
* @return scalar|null
*/
public static function trim($s, $charlist = null)
{
    # Strips characters from both ends of a UTF-8 string.
    # Bail out when ReflectionTypeHint::isValid() rejects the call
    # (defined outside this file chunk; presumably an argument type check).
    if (! ReflectionTypeHint::isValid()) return false;
    # A null value is handed back untouched.
    if (is_null($s)) return $s;
    # No char list: strip the default whitespace set of the native function.
    if ($charlist === null) return trim($s);
    # A pure ASCII char list can be handled byte-wise by the native function
    # (the list used to be dropped here, so custom ASCII characters were never stripped).
    if (self::is_ascii($charlist)) return trim($s, $charlist);
    # Multibyte char list: strip both ends via a UTF-8 aware character class.
    $charlist_re = self::_preg_quote_class($charlist, '~');
    $s = preg_replace('~^[' . $charlist_re . ']+~suSX', '', $s);
    # The D modifier makes "$" match only at the very end, not before a trailing newline.
    return preg_replace('~[' . $charlist_re . ']+$~suSXD', '', $s);
}
private static function _preg_quote_class($charlist, $delimiter = null)
{
    # Escapes a list of characters so that it can be embedded between "[" and "]"
    # of a PCRE character class.
    #
    # $charlist  - characters that should be matched literally inside the class
    # $delimiter - optional pattern delimiter that has to be escaped as well
    # Returns the escaped character list.
    #
    # preg_quote() is not used (marked DEPRECATED by the original author):
    # only the characters that are special inside a class are escaped here.
    $quote_table = array(
        '\\' => '\\\\', #the escape character itself
        '-'  => '\\-',  #would otherwise build a range
        ']'  => '\\]',  #would otherwise close the class
        '['  => '\\[',  #would otherwise start a POSIX class like [:alpha:]
        '^'  => '\\^',  #would otherwise negate the class when it comes first
    );
    # strtr() does not accept an empty search key, so an empty delimiter is skipped.
    if (is_string($delimiter) && $delimiter !== '') $quote_table[$delimiter] = '\\' . $delimiter;
    return strtr($charlist, $quote_table);
}
/**
* @param string|null $s
* @param int|digit $length
* @param string $pad_str
* @param int $type STR_PAD_LEFT, STR_PAD_RIGHT or STR_PAD_BOTH
* @return string|bool|null
*/
public static function str_pad($s, $length, $pad_str = ' ', $type = STR_PAD_RIGHT)
{
    # Pads a UTF-8 string to $length characters with $pad_str.
    # Bail out when ReflectionTypeHint::isValid() rejects the call
    # (defined outside this file chunk; presumably an argument type check).
    if (! ReflectionTypeHint::isValid()) return false;
    # A null string is handed back untouched.
    if (is_null($s)) return $s;

    # Nothing to do when the string is already long enough.
    $input_len = self::strlen($s);
    if ($length <= $input_len) return $s;

    # An empty (or unmeasurable) padding string would divide by zero below.
    $pad_str_len = self::strlen($pad_str);
    if (! $pad_str_len)
    {
        trigger_error('Parameter 3 should be a non-empty string!', E_USER_WARNING);
        return false;
    }

    # Number of characters that have to be added in total.
    $pad_len = $length - $input_len;

    if ($type == STR_PAD_RIGHT)
    {
        $repeat_num = (int) ceil($pad_len / $pad_str_len);
        # Repeat the padding generously, then cut the result to the exact length.
        return self::substr($s . str_repeat($pad_str, $repeat_num), 0, $length);
    }
    if ($type == STR_PAD_LEFT)
    {
        $repeat_num = (int) ceil($pad_len / $pad_str_len);
        return self::substr(str_repeat($pad_str, $repeat_num), 0, $pad_len) . $s;
    }
    if ($type == STR_PAD_BOTH)
    {
        # With an odd amount the extra character goes to the right side.
        $pad_amount_left  = (int) floor($pad_len / 2);
        $pad_amount_right = $pad_len - $pad_amount_left;
        $repeat_times_left  = (int) ceil($pad_amount_left / $pad_str_len);
        $repeat_times_right = (int) ceil($pad_amount_right / $pad_str_len);
        $padding_left  = self::substr(str_repeat($pad_str, $repeat_times_left), 0, $pad_amount_left);
        $padding_right = self::substr(str_repeat($pad_str, $repeat_times_right), 0, $pad_amount_right);
        return $padding_left . $s . $padding_right;
    }

    trigger_error('Parameter 4 should be a constant of STR_PAD_RIGHT, STR_PAD_LEFT or STR_PAD_BOTH!', E_USER_WARNING);
    return false;
}
/**
* @param string $str
* @param string $mask
* @param int|null $start
* @param int|null $length
* @return int|bool
*/
public static function strspn($str, $mask, $start = null, $length = null)
{
    # Returns the length (in characters) of the initial segment of $str that
    # consists only of characters contained in $mask.
    # Bail out when ReflectionTypeHint::isValid() rejects the call
    # (defined outside this file chunk; presumably an argument type check).
    if (! ReflectionTypeHint::isValid()) return false;
    #if (self::is_ascii($str) && self::is_ascii($mask)) return strspn($str, $mask, $start, $length);

    # Restrict the examined part of the string first; a missing start means 0.
    if ($start !== null || $length !== null)
    {
        $str = self::substr($str, $start === null ? 0 : $start, $length);
        if ($str === false) return false;
    }

    # An empty mask matches nothing (and would build an invalid character class).
    if ($mask === '' || $mask === null) return 0;

    # The result of self::strlen() was computed but never returned before,
    # so the method always answered 0.
    if (preg_match('~^[' . preg_quote($mask, '~') . ']+~uSX', $str, $m)) return self::strlen($m[0]);
    return 0;
}
/**
* Recodes the text files in the specified folder to UTF-8.
* Binary files, files that are already encoded in UTF-8 and files that cannot be converted are skipped,
* so the method works reliably enough.
*
*
* @param string $dir Directory to scan
* @param string|null $files_re Regular expression for the file name pattern,
* for example: '~\.(?:txt|sql|php|pl|py|sh|tpl|xml|xsl|html|xhtml|phtml|htm|js|json|css|conf|cfg|ini|htaccess)$~sSX'
* @param bool $is_recursive Process nested folders and files?
* @param string $charset Source charset
* @param string|null $dirs_ignore_re Regular expression for folders to exclude from processing,
* for example: '~^(?:cache|images?|photos?|fonts?|img|ico|\.svn|\.hg|\.cvs)$~siSX'
* @param bool $is_echo Print the processed file names and their status to the output stream?
* @param bool $is_simulate Simulate the run without actually rewriting the files?
* @return int|bool Returns the number of recoded files.
* Returns FALSE if error occurred
*/
public static function convert_files_from(
    $dir,
    $files_re = null,
    $is_recursive = true,
    $charset = 'cp1251',
    $dirs_ignore_re = null,
    $is_echo = false,
    $is_simulate = false)
{
    # Bail out when ReflectionTypeHint::isValid() rejects the call
    # (defined outside this file chunk; presumably an argument type check).
    if (! ReflectionTypeHint::isValid()) return false;
    $dh = opendir($dir);
    if (! is_resource($dh)) return false;

    # Number of recoded files; switched to FALSE on the first fatal error.
    # Errors leave the loop with "break" instead of "return", so that the
    # directory handle is always closed below.
    $counter = 0;
    while (($name = readdir($dh)) !== false)
    {
        if ($name == '.' || $name == '..') continue;
        $file = $dir . '/' . $name;
        if (is_file($file))
        {
            if (is_string($files_re) && ! preg_match($files_re, $name)) continue;
            if ($is_echo) echo $file;
            $s = @file_get_contents($file);
            if (! is_string($s))
            {
                if ($is_echo) echo ' Error to reading' . PHP_EOL;
                $counter = false;
                break;
            }
            # Files that are already valid UTF-8 are left alone.
            if (self::is_utf8($s))
            {
                if ($is_echo) echo ' UTF-8' . PHP_EOL;
                continue;
            }
            $s = self::_convert($s, $charset, 'UTF-8');
            #ignore errors that occur when trying to recode binary files
            if (! is_string($s) || ! self::is_utf8($s))
            {
                if ($is_echo) echo ' Binary' . PHP_EOL;
                continue;
            }
            $ext = strtolower(pathinfo($name, PATHINFO_EXTENSION));
            if ($ext === 'htm' || $ext === 'html' || $ext === 'xhtml' || $ext === 'phtml' || $ext === 'tpl')
            {
                # Update the charset declared in <meta ... content="text/html; charset=...">.
                $fixed = preg_replace('~(<meta .+? content="text/html; [\x00-\x20]+ charset=)  #1
                                        [-a-zA-Z\d]+
                                        (" [^>]* >)  #2
                                       ~sixSX', '$1utf-8$2', $s);
                # preg_replace() returns NULL on error; never write that to the file.
                if (is_string($fixed)) $s = $fixed;
            }
            if ($ext === 'xml' || $ext === 'xsl' || $ext === 'tpl')
            {
                # Update the encoding declared in <?xml ... encoding="..." ?>.
                $fixed = preg_replace('~(<\?xml .+? encoding=")  #1
                                        [-a-zA-Z\d]+
                                        (" .*? \?>)  #2
                                       ~sixSX', '$1utf-8$2', $s);
                if (is_string($fixed)) $s = $fixed;
            }
            if (! $is_simulate)
            {
                $bytes = @file_put_contents($file, $s);
                if ($bytes === false)
                {
                    if ($is_echo) echo ' Error to writing' . PHP_EOL;
                    $counter = false;
                    break;
                }
            }
            if ($is_echo) echo ' ' . $charset . ' -> UTF-8' . PHP_EOL;
            $counter++;
        }
        elseif ($is_recursive && is_dir($file))
        {
            if (! is_string($dirs_ignore_re) || ! preg_match($dirs_ignore_re, $name))
            {
                # Descend into the sub-directory with the same settings.
                $c = self::convert_files_from($file, $files_re, $is_recursive, $charset, $dirs_ignore_re, $is_echo, $is_simulate);
                if ($c === false)
                {
                    $counter = false;
                    break;
                }
                $counter += $c;
            }
        }
    }
    closedir($dh);
    return $counter;
}
/**
*
* @param int|string $low
* @param int|string $high
* @param int $step
* @return array|bool Returns FALSE if error occurred
*/
public static function range($low, $high, $step = 1)
{
    # Builds an array of UTF-8 characters (or plain integers) from $low to $high.
    # Bail out when ReflectionTypeHint::isValid() rejects the call
    # (defined outside this file chunk; presumably an argument type check).
    if (! ReflectionTypeHint::isValid()) return false;

    # Integer bounds need no code point conversion: delegate to the native function.
    $has_int_bound = is_int($low) || is_int($high);
    if ($has_int_bound) return range($low, $high, $step); #speed improve

    # Character bounds: convert both to code points first ...
    $first_cp = self::ord($low);
    $last_cp  = self::ord($high);
    if ($first_cp === false || $last_cp === false) return false;

    # ... build the numeric range and map every code point back to a character.
    $codepoints = range($first_cp, $last_cp, $step);
    return array_map(array(__CLASS__, 'chr'), $codepoints);
}
/**
*
* @param string|null $s
* @param string|array $from
* @param string|null $to
* @return string|bool|null Returns FALSE if error occurred
*/
public static function strtr($s, $from, $to = null)
{
    # Translates characters of a UTF-8 string: either with a replace-pairs array
    # in $from, or character by character from $from to $to.
    # Bail out when ReflectionTypeHint::isValid() rejects the call
    # (defined outside this file chunk; presumably an argument type check).
    if (! ReflectionTypeHint::isValid()) return false;
    # A null string is handed back untouched.
    if (is_null($s)) return $s;
    if (is_array($from)) return strtr($s, $from); #speed improve

    # Like the native function: nothing is translated when one of the lists is empty.
    if ($from === '' || $to === '') return $s;

    $keys   = self::str_split($from);
    $values = self::str_split($to);
    # Covers FALSE (error) as well as a missing $to.
    if (! is_array($keys) || ! is_array($values)) return false;

    # The native strtr() ignores the surplus characters of the longer list,
    # whereas array_combine() fails on arrays of different size; so cut both
    # lists to the common length first.
    $pairs = min(count($keys), count($values));
    if ($pairs === 0) return $s;
    $table = array_combine(array_slice($keys, 0, $pairs), array_slice($values, 0, $pairs));
    if (! is_array($table)) return false;
    return strtr($s, $table);
}
/**
* Built-in self test: evaluates a list of assertion expressions against the methods of this class.
*
* NOTE(review): several expressions below (the html_entity_decode() cases and the ones that contain
* "rn" or "xc2xad" sequences) look damaged - entities and backslashes seem to have been lost when
* the file was copied. They should be restored from the upstream source before this method is used.
* NOTE(review): the expressions are passed to assert() as strings, which relies on assert()
* evaluating string arguments as PHP code - confirm that the targeted PHP version still supports this.
*
* @return bool TRUE if all assertions passed, FALSE if assert() returned FALSE for one of them
*/
public static function tests()
{
# Activate assertions, stop the script on the first failed one, report failures as warnings
# and do not silence errors raised while an expression is evaluated.
assert_options(ASSERT_ACTIVE, true);
assert_options(ASSERT_BAIL, true);
assert_options(ASSERT_WARNING, true);
assert_options(ASSERT_QUIET_EVAL, false);
# Each element is one PHP expression that is expected to evaluate to TRUE.
$a = array(
'self::html_entity_decode(""&<>", true) === ""&<>"',
'self::html_entity_decode(""&<>", false) === ""&<>"',
'self::html_entity_decode("&amp;", true) === "&"',
'self::html_entity_decode("&amp;", false) === "&amp;"',
'self::html_entity_decode(""", true) === """',
'self::html_entity_decode(""", false) === """',
'self::html_entity_decode("'", true) === "'"',
'self::html_entity_decode("'", false) === "'"',
'self::html_entity_decode(""", true) === """',
'self::html_entity_decode(""", false) === """',
'self::array_change_key_case(array("АБВГД" => "АБВГД"), CASE_LOWER) === array("абвгд" => "АБВГД")',
'self::array_change_key_case(array("абвгд" => "абвгд"), CASE_UPPER) === array("АБВГД" => "абвгд")',
'self::blocks_check("Яндекс", "Cyrillic") === true',
'self::blocks_check("Google", "Basic Latin") === true',
'self::blocks_check("Google & Яндекс", array("Basic Latin", "Cyrillic")) === true',
'self::blocks_check("Ё-моё, Yandex!", array(array(0x20, 0x7E), #[x20-x7E]
array(0x0410, 0x044F), #[A-Яa-я]
0x0401, #russian yo (Ё)
0x0451, #russian ye (ё)
)) === true',
'self::chunk_split("абвг", 2) === "абrnвг"',
'self::chunk_split("абвг", 2, "|") === "аб|вг"',
'self::lowercase("1234-ABCD-АБВГ") === "1234-abcd-абвг"',
'self::lowercase(array("1234-ABCD-АБВГ" => "1234-ABCD-АБВГ")) === array("1234-ABCD-АБВГ" => "1234-abcd-абвг")',
'self::uppercase("1234-abcd-абвг") === "1234-ABCD-АБВГ"',
'self::uppercase(array("1234-abcd-абвг" => "1234-abcd-абвг")) === array("1234-abcd-абвг" => "1234-ABCD-АБВГ")',
'self::convert_from(self::convert_to("123-ABC-abc-АБВ-абв", $charset = "cp1251"), $charset = "cp1251") === "123-ABC-abc-АБВ-абв"',
'self::diactrical_remove("вдохxc2xadноxc2xadвеxccx81xc2xadние") === "вдохxc2xadноxc2xadвеxc2xadние"',
'self::diactrical_remove("вдохxc2xadноxc2xadвеxccx81xc2xadние", array("xc2xad")) === "вдохновение"',
'self::diactrical_remove("вдохxc2xadноxc2xadвеxccx81xc2xadние", array("xc2xad"), true, $restore_table) === "вдохновение"',
'self::diactrical_restore("вдохновение", $restore_table) === "вдохxc2xadноxc2xadвеxccx81xc2xadние"',
'self::is_utf8(file_get_contents(' . var_export(__FILE__, true) . ', true)) === true',
'self::is_utf8(file_get_contents(' . var_export(__FILE__, true) . ', false)) === true',
'self::is_ascii(file_get_contents(' . var_export(__FILE__, true) . ')) === false',
#range() uses ord() and chr()
'self::range("A", "D") === array("A", "B", "C", "D")',
'self::range("а", "г") === array("а", "б", "в", "г")',
'self::range(1, 3) === array(1, 2, 3)',
'"↔" === self::chr(self::ord("↔"))',
'"123-ABC-abc-АБВ-абв" === self::from_unicode(self::to_unicode("123-ABC-abc-АБВ-абв"))',
'self::strpos("123-ABC-abc-абв-АБВ-где", "АБВ") === 16',
'self::stripos("123-ABC-abc-абд-АБВ-где", "абв") === 16',
'self::strpos("123-ABC-abc", "АБВ") === false',
'self::strpos("123-АБВ-абв", "abc") === false',
'self::preg_quote_case_insensitive("123_слово_test") === "123_(с|С)(л|Л)(о|О)(в|В)(о|О)_[tT][eE][sS][tT]"',
'self::preg_quote_case_insensitive("123_test") === "(?i:123_test)"',
//'self::strlen(file_get_contents(' . var_export(__FILE__, true) . ', true))'
);
# Stop at the first expression for which assert() does not report success.
foreach ($a as $k => $v) if (! assert($v)) return false;
//$start_time = microtime(true);
//$s = file_get_contents(__FILE__);
//for ($i = 0; $i < 10; $i++) $r = self::html_entity_encode($s);
//$time = microtime(true) - $start_time;
//d($time, $r);
return true;
}
}