Файл: utf8_html_entity_decode.php
Строк: 374
<?php
/**
 * Convert HTML entities to UTF-8 characters.
 *
 * Decodes every named entity of HTML 4.01 (Latin-1, symbols, Greek letters and
 * the "special" set) as well as decimal (&#169;) and hexadecimal (&#xA9;)
 * numeric character references, producing UTF-8 encoded output.
 *
 * The markup-significant characters (< > & ") are handled separately:
 *  - when $is_htmlspecialchars is false (default), &lt; &gt; &amp; &quot; are
 *    left untouched, and numeric references to those four characters are
 *    normalized to the corresponding named entity (e.g. &#60; becomes &lt;),
 *    so the result is still safe to embed in HTML;
 *  - when $is_htmlspecialchars is true, they are decoded to the raw characters.
 *
 * The input is decoded in a single pass, so text produced by a replacement is
 * never decoded a second time (e.g. "&amp;lt;" becomes "&lt;", not "<").
 * Unknown named entities and numeric references that do not denote a Unicode
 * scalar value (surrogates, values above U+10FFFF) are left as they are.
 *
 * @param string $s                   text that may contain HTML entities
 * @param bool   $is_htmlspecialchars also decode the special HTML entities (&lt; &gt; &amp; &quot;)?
 * @return string                     the decoded UTF-8 text
 * @link http://www.htmlhelp.com/reference/html40/entities/
 * @link http://www.alanwood.net/demos/ent4_frame.html (HTML 4.01 Character Entity References)
 * @link http://msdn.microsoft.com/workshop/author/dhtml/reference/charsets/charset1.asp?frame=true
 * @link http://msdn.microsoft.com/workshop/author/dhtml/reference/charsets/charset2.asp?frame=true
 * @link http://msdn.microsoft.com/workshop/author/dhtml/reference/charsets/charset3.asp?frame=true
 *
 * @author Nasibullin Rinat <n a s i b u l l i n at starlink ru>
 * @charset ANSI
 * @version 2.1.12
 */
function utf8_html_entity_decode($s, $is_htmlspecialchars = false)
{
    # Speed optimization: the shortest possible entity is 4 bytes long (&#d; or &xx;),
    # and an entity needs an '&' followed somewhere later by a ';'.
    # The parentheses around the assignment matter: without them $pos would
    # receive the boolean result of the comparison instead of the offset.
    if (strlen($s) < 4
        || ($pos = strpos($s, '&')) === false
        || strpos($s, ';', $pos) === false) {
        return $s;
    }

    # Entity name (without the leading '&' and trailing ';') => UTF-8 byte sequence.
    static $table = array(
        #Latin-1 Entities:
        'nbsp'     => "\xc2\xa0", #no-break space = non-breaking space
        'iexcl'    => "\xc2\xa1", #inverted exclamation mark
        'cent'     => "\xc2\xa2", #cent sign
        'pound'    => "\xc2\xa3", #pound sign
        'curren'   => "\xc2\xa4", #currency sign
        'yen'      => "\xc2\xa5", #yen sign = yuan sign
        'brvbar'   => "\xc2\xa6", #broken bar = broken vertical bar
        'sect'     => "\xc2\xa7", #section sign
        'uml'      => "\xc2\xa8", #diaeresis = spacing diaeresis
        'copy'     => "\xc2\xa9", #copyright sign
        'ordf'     => "\xc2\xaa", #feminine ordinal indicator
        'laquo'    => "\xc2\xab", #left-pointing double angle quotation mark = left pointing guillemet
        'not'      => "\xc2\xac", #not sign
        'shy'      => "\xc2\xad", #soft hyphen = discretionary hyphen
        'reg'      => "\xc2\xae", #registered sign = registered trade mark sign
        'macr'     => "\xc2\xaf", #macron = spacing macron = overline = APL overbar
        'deg'      => "\xc2\xb0", #degree sign
        'plusmn'   => "\xc2\xb1", #plus-minus sign = plus-or-minus sign
        'sup2'     => "\xc2\xb2", #superscript two = superscript digit two = squared
        'sup3'     => "\xc2\xb3", #superscript three = superscript digit three = cubed
        'acute'    => "\xc2\xb4", #acute accent = spacing acute
        'micro'    => "\xc2\xb5", #micro sign
        'para'     => "\xc2\xb6", #pilcrow sign = paragraph sign
        'middot'   => "\xc2\xb7", #middle dot = Georgian comma = Greek middle dot
        'cedil'    => "\xc2\xb8", #cedilla = spacing cedilla
        'sup1'     => "\xc2\xb9", #superscript one = superscript digit one
        'ordm'     => "\xc2\xba", #masculine ordinal indicator
        'raquo'    => "\xc2\xbb", #right-pointing double angle quotation mark = right pointing guillemet
        'frac14'   => "\xc2\xbc", #vulgar fraction one quarter = fraction one quarter
        'frac12'   => "\xc2\xbd", #vulgar fraction one half = fraction one half
        'frac34'   => "\xc2\xbe", #vulgar fraction three quarters = fraction three quarters
        'iquest'   => "\xc2\xbf", #inverted question mark = turned question mark
        #Latin capital letter
        'Agrave'   => "\xc3\x80", #Latin capital letter A with grave = Latin capital letter A grave
        'Aacute'   => "\xc3\x81", #Latin capital letter A with acute
        'Acirc'    => "\xc3\x82", #Latin capital letter A with circumflex
        'Atilde'   => "\xc3\x83", #Latin capital letter A with tilde
        'Auml'     => "\xc3\x84", #Latin capital letter A with diaeresis
        'Aring'    => "\xc3\x85", #Latin capital letter A with ring above = Latin capital letter A ring
        'AElig'    => "\xc3\x86", #Latin capital letter AE = Latin capital ligature AE
        'Ccedil'   => "\xc3\x87", #Latin capital letter C with cedilla
        'Egrave'   => "\xc3\x88", #Latin capital letter E with grave
        'Eacute'   => "\xc3\x89", #Latin capital letter E with acute
        'Ecirc'    => "\xc3\x8a", #Latin capital letter E with circumflex
        'Euml'     => "\xc3\x8b", #Latin capital letter E with diaeresis
        'Igrave'   => "\xc3\x8c", #Latin capital letter I with grave
        'Iacute'   => "\xc3\x8d", #Latin capital letter I with acute
        'Icirc'    => "\xc3\x8e", #Latin capital letter I with circumflex
        'Iuml'     => "\xc3\x8f", #Latin capital letter I with diaeresis
        'ETH'      => "\xc3\x90", #Latin capital letter ETH
        'Ntilde'   => "\xc3\x91", #Latin capital letter N with tilde
        'Ograve'   => "\xc3\x92", #Latin capital letter O with grave
        'Oacute'   => "\xc3\x93", #Latin capital letter O with acute
        'Ocirc'    => "\xc3\x94", #Latin capital letter O with circumflex
        'Otilde'   => "\xc3\x95", #Latin capital letter O with tilde
        'Ouml'     => "\xc3\x96", #Latin capital letter O with diaeresis
        'times'    => "\xc3\x97", #multiplication sign
        'Oslash'   => "\xc3\x98", #Latin capital letter O with stroke = Latin capital letter O slash
        'Ugrave'   => "\xc3\x99", #Latin capital letter U with grave
        'Uacute'   => "\xc3\x9a", #Latin capital letter U with acute
        'Ucirc'    => "\xc3\x9b", #Latin capital letter U with circumflex
        'Uuml'     => "\xc3\x9c", #Latin capital letter U with diaeresis
        'Yacute'   => "\xc3\x9d", #Latin capital letter Y with acute
        'THORN'    => "\xc3\x9e", #Latin capital letter THORN
        #Latin small letter
        'szlig'    => "\xc3\x9f", #Latin small letter sharp s = ess-zed
        'agrave'   => "\xc3\xa0", #Latin small letter a with grave = Latin small letter a grave
        'aacute'   => "\xc3\xa1", #Latin small letter a with acute
        'acirc'    => "\xc3\xa2", #Latin small letter a with circumflex
        'atilde'   => "\xc3\xa3", #Latin small letter a with tilde
        'auml'     => "\xc3\xa4", #Latin small letter a with diaeresis
        'aring'    => "\xc3\xa5", #Latin small letter a with ring above = Latin small letter a ring
        'aelig'    => "\xc3\xa6", #Latin small letter ae = Latin small ligature ae
        'ccedil'   => "\xc3\xa7", #Latin small letter c with cedilla
        'egrave'   => "\xc3\xa8", #Latin small letter e with grave
        'eacute'   => "\xc3\xa9", #Latin small letter e with acute
        'ecirc'    => "\xc3\xaa", #Latin small letter e with circumflex
        'euml'     => "\xc3\xab", #Latin small letter e with diaeresis
        'igrave'   => "\xc3\xac", #Latin small letter i with grave
        'iacute'   => "\xc3\xad", #Latin small letter i with acute
        'icirc'    => "\xc3\xae", #Latin small letter i with circumflex
        'iuml'     => "\xc3\xaf", #Latin small letter i with diaeresis
        'eth'      => "\xc3\xb0", #Latin small letter eth
        'ntilde'   => "\xc3\xb1", #Latin small letter n with tilde
        'ograve'   => "\xc3\xb2", #Latin small letter o with grave
        'oacute'   => "\xc3\xb3", #Latin small letter o with acute
        'ocirc'    => "\xc3\xb4", #Latin small letter o with circumflex
        'otilde'   => "\xc3\xb5", #Latin small letter o with tilde
        'ouml'     => "\xc3\xb6", #Latin small letter o with diaeresis
        'divide'   => "\xc3\xb7", #division sign
        'oslash'   => "\xc3\xb8", #Latin small letter o with stroke = Latin small letter o slash
        'ugrave'   => "\xc3\xb9", #Latin small letter u with grave
        'uacute'   => "\xc3\xba", #Latin small letter u with acute
        'ucirc'    => "\xc3\xbb", #Latin small letter u with circumflex
        'uuml'     => "\xc3\xbc", #Latin small letter u with diaeresis
        'yacute'   => "\xc3\xbd", #Latin small letter y with acute
        'thorn'    => "\xc3\xbe", #Latin small letter thorn
        'yuml'     => "\xc3\xbf", #Latin small letter y with diaeresis
        #Symbols and Greek Letters:
        'fnof'     => "\xc6\x92", #Latin small f with hook = function = florin
        'Alpha'    => "\xce\x91", #Greek capital letter alpha
        'Beta'     => "\xce\x92", #Greek capital letter beta
        'Gamma'    => "\xce\x93", #Greek capital letter gamma
        'Delta'    => "\xce\x94", #Greek capital letter delta
        'Epsilon'  => "\xce\x95", #Greek capital letter epsilon
        'Zeta'     => "\xce\x96", #Greek capital letter zeta
        'Eta'      => "\xce\x97", #Greek capital letter eta
        'Theta'    => "\xce\x98", #Greek capital letter theta
        'Iota'     => "\xce\x99", #Greek capital letter iota
        'Kappa'    => "\xce\x9a", #Greek capital letter kappa
        'Lambda'   => "\xce\x9b", #Greek capital letter lambda
        'Mu'       => "\xce\x9c", #Greek capital letter mu
        'Nu'       => "\xce\x9d", #Greek capital letter nu
        'Xi'       => "\xce\x9e", #Greek capital letter xi
        'Omicron'  => "\xce\x9f", #Greek capital letter omicron
        'Pi'       => "\xce\xa0", #Greek capital letter pi
        'Rho'      => "\xce\xa1", #Greek capital letter rho
        'Sigma'    => "\xce\xa3", #Greek capital letter sigma
        'Tau'      => "\xce\xa4", #Greek capital letter tau
        'Upsilon'  => "\xce\xa5", #Greek capital letter upsilon
        'Phi'      => "\xce\xa6", #Greek capital letter phi
        'Chi'      => "\xce\xa7", #Greek capital letter chi
        'Psi'      => "\xce\xa8", #Greek capital letter psi
        'Omega'    => "\xce\xa9", #Greek capital letter omega
        'alpha'    => "\xce\xb1", #Greek small letter alpha
        'beta'     => "\xce\xb2", #Greek small letter beta
        'gamma'    => "\xce\xb3", #Greek small letter gamma
        'delta'    => "\xce\xb4", #Greek small letter delta
        'epsilon'  => "\xce\xb5", #Greek small letter epsilon
        'zeta'     => "\xce\xb6", #Greek small letter zeta
        'eta'      => "\xce\xb7", #Greek small letter eta
        'theta'    => "\xce\xb8", #Greek small letter theta
        'iota'     => "\xce\xb9", #Greek small letter iota
        'kappa'    => "\xce\xba", #Greek small letter kappa
        'lambda'   => "\xce\xbb", #Greek small letter lambda
        'mu'       => "\xce\xbc", #Greek small letter mu
        'nu'       => "\xce\xbd", #Greek small letter nu
        'xi'       => "\xce\xbe", #Greek small letter xi
        'omicron'  => "\xce\xbf", #Greek small letter omicron
        'pi'       => "\xcf\x80", #Greek small letter pi
        'rho'      => "\xcf\x81", #Greek small letter rho
        'sigmaf'   => "\xcf\x82", #Greek small letter final sigma
        'sigma'    => "\xcf\x83", #Greek small letter sigma
        'tau'      => "\xcf\x84", #Greek small letter tau
        'upsilon'  => "\xcf\x85", #Greek small letter upsilon
        'phi'      => "\xcf\x86", #Greek small letter phi
        'chi'      => "\xcf\x87", #Greek small letter chi
        'psi'      => "\xcf\x88", #Greek small letter psi
        'omega'    => "\xcf\x89", #Greek small letter omega
        'thetasym' => "\xcf\x91", #Greek small letter theta symbol
        'upsih'    => "\xcf\x92", #Greek upsilon with hook symbol
        'piv'      => "\xcf\x96", #Greek pi symbol
        'bull'     => "\xe2\x80\xa2", #bullet = black small circle
        'hellip'   => "\xe2\x80\xa6", #horizontal ellipsis = three dot leader
        'prime'    => "\xe2\x80\xb2", #prime = minutes = feet
        'Prime'    => "\xe2\x80\xb3", #double prime = seconds = inches
        'oline'    => "\xe2\x80\xbe", #overline = spacing overscore
        'frasl'    => "\xe2\x81\x84", #fraction slash
        'weierp'   => "\xe2\x84\x98", #script capital P = power set = Weierstrass p
        'image'    => "\xe2\x84\x91", #blackletter capital I = imaginary part
        'real'     => "\xe2\x84\x9c", #blackletter capital R = real part symbol
        'trade'    => "\xe2\x84\xa2", #trade mark sign
        'alefsym'  => "\xe2\x84\xb5", #alef symbol = first transfinite cardinal
        'larr'     => "\xe2\x86\x90", #leftwards arrow
        'uarr'     => "\xe2\x86\x91", #upwards arrow
        'rarr'     => "\xe2\x86\x92", #rightwards arrow
        'darr'     => "\xe2\x86\x93", #downwards arrow
        'harr'     => "\xe2\x86\x94", #left right arrow
        'crarr'    => "\xe2\x86\xb5", #downwards arrow with corner leftwards = carriage return
        'lArr'     => "\xe2\x87\x90", #leftwards double arrow
        'uArr'     => "\xe2\x87\x91", #upwards double arrow
        'rArr'     => "\xe2\x87\x92", #rightwards double arrow
        'dArr'     => "\xe2\x87\x93", #downwards double arrow
        'hArr'     => "\xe2\x87\x94", #left right double arrow
        'forall'   => "\xe2\x88\x80", #for all
        'part'     => "\xe2\x88\x82", #partial differential
        'exist'    => "\xe2\x88\x83", #there exists
        'empty'    => "\xe2\x88\x85", #empty set = null set = diameter
        'nabla'    => "\xe2\x88\x87", #nabla = backward difference
        'isin'     => "\xe2\x88\x88", #element of
        'notin'    => "\xe2\x88\x89", #not an element of
        'ni'       => "\xe2\x88\x8b", #contains as member
        'prod'     => "\xe2\x88\x8f", #n-ary product = product sign
        'sum'      => "\xe2\x88\x91", #n-ary summation
        'minus'    => "\xe2\x88\x92", #minus sign
        'lowast'   => "\xe2\x88\x97", #asterisk operator
        'radic'    => "\xe2\x88\x9a", #square root = radical sign
        'prop'     => "\xe2\x88\x9d", #proportional to
        'infin'    => "\xe2\x88\x9e", #infinity
        'ang'      => "\xe2\x88\xa0", #angle
        'and'      => "\xe2\x88\xa7", #logical and = wedge
        'or'       => "\xe2\x88\xa8", #logical or = vee
        'cap'      => "\xe2\x88\xa9", #intersection = cap
        'cup'      => "\xe2\x88\xaa", #union = cup
        'int'      => "\xe2\x88\xab", #integral
        'there4'   => "\xe2\x88\xb4", #therefore
        'sim'      => "\xe2\x88\xbc", #tilde operator = varies with = similar to
        'cong'     => "\xe2\x89\x85", #approximately equal to
        'asymp'    => "\xe2\x89\x88", #almost equal to = asymptotic to
        'ne'       => "\xe2\x89\xa0", #not equal to
        'equiv'    => "\xe2\x89\xa1", #identical to
        'le'       => "\xe2\x89\xa4", #less-than or equal to
        'ge'       => "\xe2\x89\xa5", #greater-than or equal to
        'sub'      => "\xe2\x8a\x82", #subset of
        'sup'      => "\xe2\x8a\x83", #superset of
        'nsub'     => "\xe2\x8a\x84", #not a subset of
        'sube'     => "\xe2\x8a\x86", #subset of or equal to
        'supe'     => "\xe2\x8a\x87", #superset of or equal to
        'oplus'    => "\xe2\x8a\x95", #circled plus = direct sum
        'otimes'   => "\xe2\x8a\x97", #circled times = vector product
        'perp'     => "\xe2\x8a\xa5", #up tack = orthogonal to = perpendicular
        'sdot'     => "\xe2\x8b\x85", #dot operator
        'lceil'    => "\xe2\x8c\x88", #left ceiling = APL upstile
        'rceil'    => "\xe2\x8c\x89", #right ceiling
        'lfloor'   => "\xe2\x8c\x8a", #left floor = APL downstile
        'rfloor'   => "\xe2\x8c\x8b", #right floor
        'lang'     => "\xe2\x8c\xa9", #left-pointing angle bracket = bra
        'rang'     => "\xe2\x8c\xaa", #right-pointing angle bracket = ket
        'loz'      => "\xe2\x97\x8a", #lozenge
        'spades'   => "\xe2\x99\xa0", #black spade suit
        'clubs'    => "\xe2\x99\xa3", #black club suit = shamrock
        'hearts'   => "\xe2\x99\xa5", #black heart suit = valentine
        'diams'    => "\xe2\x99\xa6", #black diamond suit
        #Other Special Characters:
        'OElig'    => "\xc5\x92", #Latin capital ligature OE
        'oelig'    => "\xc5\x93", #Latin small ligature oe
        'Scaron'   => "\xc5\xa0", #Latin capital letter S with caron
        'scaron'   => "\xc5\xa1", #Latin small letter s with caron
        'Yuml'     => "\xc5\xb8", #Latin capital letter Y with diaeresis
        'circ'     => "\xcb\x86", #modifier letter circumflex accent
        'tilde'    => "\xcb\x9c", #small tilde
        'ensp'     => "\xe2\x80\x82", #en space
        'emsp'     => "\xe2\x80\x83", #em space
        'thinsp'   => "\xe2\x80\x89", #thin space
        'zwnj'     => "\xe2\x80\x8c", #zero width non-joiner
        'zwj'      => "\xe2\x80\x8d", #zero width joiner
        'lrm'      => "\xe2\x80\x8e", #left-to-right mark
        'rlm'      => "\xe2\x80\x8f", #right-to-left mark
        'ndash'    => "\xe2\x80\x93", #en dash
        'mdash'    => "\xe2\x80\x94", #em dash
        'lsquo'    => "\xe2\x80\x98", #left single quotation mark
        'rsquo'    => "\xe2\x80\x99", #right single quotation mark (and apostrophe!)
        'sbquo'    => "\xe2\x80\x9a", #single low-9 quotation mark
        'ldquo'    => "\xe2\x80\x9c", #left double quotation mark
        'rdquo'    => "\xe2\x80\x9d", #right double quotation mark
        'bdquo'    => "\xe2\x80\x9e", #double low-9 quotation mark
        'dagger'   => "\xe2\x80\xa0", #dagger
        'Dagger'   => "\xe2\x80\xa1", #double dagger
        'permil'   => "\xe2\x80\xb0", #per mille sign
        'lsaquo'   => "\xe2\x80\xb9", #single left-pointing angle quotation mark
        'rsaquo'   => "\xe2\x80\xba", #single right-pointing angle quotation mark
        'euro'     => "\xe2\x82\xac", #euro sign
    );

    # Markup-significant characters; decoded only when $is_htmlspecialchars is true.
    static $htmlspecialchars = array(
        'quot' => "\x22", #quotation mark = APL quote (")
        'amp'  => "\x26", #ampersand (&)
        'lt'   => "\x3c", #less-than sign (<)
        'gt'   => "\x3e", #greater-than sign (>)
    );

    # One pass over the text handles named, decimal and hexadecimal entities.
    # A closure replaces the former /e (eval) modifier, which PHP 7 removed.
    # Capture groups: 1 = hex digits, 2 = decimal digits, 3 = entity name.
    $decoded = preg_replace_callback(
        '/&(?:#[xX]([0-9a-fA-F]{1,6})|#([0-9]{1,7})|([a-zA-Z]+[0-9]*));/',
        function ($m) use ($table, $htmlspecialchars, $is_htmlspecialchars) {
            # --- named entity ---
            if (isset($m[3]) && $m[3] !== '') {
                $name = $m[3];
                if (isset($table[$name])) {
                    return $table[$name];
                }
                if ($is_htmlspecialchars && isset($htmlspecialchars[$name])) {
                    return $htmlspecialchars[$name];
                }
                return $m[0]; # unknown (or protected) entity: keep as is
            }

            # --- numeric character reference ---
            $cp = (isset($m[2]) && $m[2] !== '') ? (int) $m[2] : (int) hexdec($m[1]);

            # Not a Unicode scalar value (surrogate or beyond U+10FFFF): keep as is.
            if ($cp > 0x10FFFF || ($cp >= 0xD800 && $cp <= 0xDFFF)) {
                return $m[0];
            }

            if ($cp < 0x80) {
                $char = chr($cp);
                if (! $is_htmlspecialchars) {
                    # Compare the whole code point (not just its low byte) and
                    # keep markup-significant characters escaped as named entities.
                    $name = array_search($char, $htmlspecialchars, true);
                    if ($name !== false) {
                        return '&' . $name . ';';
                    }
                }
                return $char;
            }

            # Encode the code point as UTF-8 by hand (no iconv/mbstring needed).
            if ($cp < 0x800) {
                return chr(0xC0 | ($cp >> 6))
                     . chr(0x80 | ($cp & 0x3F));
            }
            if ($cp < 0x10000) {
                return chr(0xE0 | ($cp >> 12))
                     . chr(0x80 | (($cp >> 6) & 0x3F))
                     . chr(0x80 | ($cp & 0x3F));
            }
            return chr(0xF0 | ($cp >> 18))
                 . chr(0x80 | (($cp >> 12) & 0x3F))
                 . chr(0x80 | (($cp >> 6) & 0x3F))
                 . chr(0x80 | ($cp & 0x3F));
        },
        $s
    );

    # preg_replace_callback() returns null on a PCRE error; fall back to the input.
    return ($decoded === null) ? $s : $decoded;
}
?>