/*
 * This file is part of the zero package.
 * Copyright (c) 2012 olamedia <olamedia@gmail.com>
 *
 * This source code is released under the MIT License.
 *
 * For the full copyright and license information, please view the LICENSE
 * file that was distributed with this source code.
 */
/**
 * Simple HTML parser.
 *
 * Wraps a DOMDocument (or the DOMNodeList produced by a previous query) and
 * selects nodes with a small subset of CSS selector syntax that is translated
 * to XPath 1.0.
 *
 * Supported selector pieces: tag, [attr=value], #id, .class,
 * :first-child, :last-child, :nth-child(odd|even|N|An+B), the descendant
 * combinator (whitespace) and the child combinator (>).
 *
 * @author olamedia <olamedia@gmail.com>
 */
class nokogiri implements IteratorAggregate{
    /**
     * PCRE pattern that matches ONE selector step (tag, attribute, id, class,
     * pseudo-class and an optional trailing ">" combinator).
     *
     * Every group is optional, so the pattern can match the empty string;
     * getXpathSubquery() guards against that. Must stay identical to the
     * value assembled by getRegexp().
     */
    const regexp = '/(?P<tag>[a-z0-9]+)?(\[(?P<attr>\S+)=(?P<value>[^\]]+)\])?(#(?P<id>[^\s:>#\.]+))?(\.(?P<class>[^\s:>#\.]+))?(:(?P<pseudo>(first|last|nth)-child)(\((?P<expr>[^\)]+)\))?)?\s*(?P<rel>>)?/isS';

    /**
     * @var string Not read or written by any method of this class.
     */
    protected $_source = '';

    /**
     * @var DOMDocument|DOMNodeList|null Loaded document, or the node list of a query result.
     */
    protected $_dom = null;

    /**
     * @var DOMDocument|null Lazily built wrapper document used when $_dom is a DOMNodeList.
     */
    protected $_tempDom = null;

    /**
     * @var DOMXpath|null Lazily created XPath evaluator bound to getDom().
     */
    protected $_xpath = null;

    /**
     * @var array Cache of selector => XPath translations shared by all instances.
     */
    protected static $_compiledXpath = array();

    /**
     * @param string $htmlString HTML markup to parse; an empty string yields an empty document.
     */
    public function __construct($htmlString = ''){
        $this->loadHtml($htmlString);
    }

    /**
     * Assembles the selector-step pattern from its named parts.
     *
     * @return string The same pattern as self::regexp.
     */
    public function getRegexp(){
        $tag = '(?P<tag>[a-z0-9]+)?';
        $attr = '(\[(?P<attr>\S+)=(?P<value>[^\]]+)\])?';
        $id = '(#(?P<id>[^\s:>#\.]+))?';
        $class = '(\.(?P<class>[^\s:>#\.]+))?';
        $child = '(first|last|nth)-child';
        $expr = '(\((?P<expr>[^\)]+)\))';
        $pseudo = '(:(?P<pseudo>'.$child.')'.$expr.'?)?';
        $rel = '\s*(?P<rel>>)?';
        return '/'.$tag.$attr.$id.$class.$pseudo.$rel.'/isS';
    }

    /**
     * Creates a parser from HTML markup.
     *
     * @param string $htmlString
     * @return nokogiri
     */
    public static function fromHtml($htmlString){
        $me = new self();
        $me->loadHtml($htmlString);
        return $me;
    }

    /**
     * Creates a parser from HTML markup that carries no charset declaration;
     * the markup is treated as UTF-8.
     *
     * @param string $htmlString
     * @return nokogiri
     */
    public static function fromHtmlNoCharset($htmlString){
        $me = new self();
        $me->loadHtmlNoCharset($htmlString);
        return $me;
    }

    /**
     * Creates a parser around an existing DOM object.
     *
     * @param DOMDocument|DOMNodeList $dom
     * @return nokogiri
     */
    public static function fromDom($dom){
        $me = new self();
        $me->loadDom($dom);
        return $me;
    }

    /**
     * Replaces the wrapped DOM object.
     *
     * @param DOMDocument|DOMNodeList $dom
     * @return void
     */
    public function loadDom($dom){
        $this->_dom = $dom;
        // Both caches are derived from the previous DOM; drop them so later
        // queries run against the newly loaded one.
        $this->_tempDom = null;
        $this->_xpath = null;
    }

    /**
     * Parses markup as UTF-8 regardless of any (missing) charset declaration.
     *
     * @param string $htmlString
     * @return void
     */
    public function loadHtmlNoCharset($htmlString = ''){
        $htmlString = (string)$htmlString;
        $dom = new DOMDocument('1.0', 'UTF-8');
        $dom->preserveWhiteSpace = false;
        if ('' !== $htmlString){
            // Prepending an XML declaration makes libxml decode the input as UTF-8.
            $this->parseHtmlQuietly($dom, '<?xml encoding="UTF-8">'.$htmlString);
            // dirty fix: remove the processing instruction injected above
            foreach ($dom->childNodes as $item){
                if ($item->nodeType == XML_PI_NODE){
                    $dom->removeChild($item); // remove hack
                    break; // childNodes is live; stop after modifying it
                }
            }
            $dom->encoding = 'UTF-8'; // insert proper
        }
        $this->loadDom($dom);
    }

    /**
     * Parses markup and makes it the wrapped document.
     *
     * @param string $htmlString
     * @return void
     */
    public function loadHtml($htmlString = ''){
        $htmlString = (string)$htmlString;
        $dom = new DOMDocument('1.0', 'UTF-8');
        $dom->preserveWhiteSpace = false;
        if ('' !== $htmlString){
            $this->parseHtmlQuietly($dom, $htmlString);
        }
        $this->loadDom($dom);
    }

    /**
     * Runs DOMDocument::loadHTML() with libxml warnings buffered instead of
     * emitted, then restores the caller's libxml error mode.
     *
     * @param DOMDocument $dom
     * @param string $htmlString Non-empty markup.
     * @return void
     */
    protected function parseHtmlQuietly($dom, $htmlString){
        $previous = libxml_use_internal_errors(true);
        $dom->loadHTML($htmlString);
        if (!$previous){
            // The buffer only exists because of this call; do not let it grow.
            libxml_clear_errors();
        }
        libxml_use_internal_errors($previous);
    }

    /**
     * Shorthand for get(): $parser('div.item').
     *
     * @param string $expression
     * @return nokogiri|null
     */
    public function __invoke($expression){
        return $this->get($expression);
    }

    /**
     * Selects nodes matching a CSS-like selector.
     *
     * @param string $expression Selector, e.g. "ul#menu > li.item".
     * @param bool $compile Whether to use the shared selector => XPath cache.
     * @return nokogiri|null Wrapper around the matching node list; null when
     *                       the selector translates to an empty query.
     * @throws InvalidArgumentException When the selector contains unsupported syntax.
     * @throws Exception When the generated XPath is rejected by DOMXpath.
     */
    public function get($expression, $compile = true){
        return $this->getElements($this->getXpathSubquery($expression, false, $compile));
    }

    /**
     * Placeholder; has no implementation.
     *
     * @return void
     */
    protected function getNodes(){
    }

    /**
     * Returns a DOMDocument for the wrapped data.
     *
     * A wrapped DOMNodeList is copied (deep import) under a synthetic <root>
     * element of a temporary document, built once and reused.
     *
     * @return DOMDocument|null Null when neither a DOMDocument nor a DOMNodeList is loaded.
     */
    public function getDom(){
        if ($this->_dom instanceof DOMDocument){
            return $this->_dom;
        }
        if ($this->_dom instanceof DOMNodeList){
            if ($this->_tempDom === null){
                $this->_tempDom = new DOMDocument('1.0', 'UTF-8');
                $root = $this->_tempDom->createElement('root');
                $this->_tempDom->appendChild($root);
                foreach ($this->_dom as $domElement){
                    $domNode = $this->_tempDom->importNode($domElement, true);
                    $root->appendChild($domNode);
                }
            }
            return $this->_tempDom;
        }
        return null;
    }

    /**
     * @return DOMXpath Evaluator bound to getDom(), created on first use.
     */
    protected function getXpath(){
        if ($this->_xpath === null){
            $this->_xpath = new DOMXpath($this->getDom());
        }
        return $this->_xpath;
    }

    /**
     * Translates a selector (one step plus everything after it) to XPath.
     *
     * NOTE: id, attribute and class values are inserted into the XPath
     * string literal as-is, so values containing quote characters are not
     * supported.
     *
     * @param string $expression Selector text.
     * @param bool $rel True when this step follows a ">" combinator
     *                  (child axis "/"), false for the descendant axis "//".
     * @param bool $compile Whether to read/write the shared translation cache.
     * @return string XPath query.
     * @throws InvalidArgumentException When part of the selector cannot be parsed.
     */
    public function getXpathSubquery($expression, $rel = false, $compile = true){
        $key = null;
        if ($compile){
            $key = $expression.($rel ? '>' : '*');
            if (isset(self::$_compiledXpath[$key])){
                return self::$_compiledXpath[$key];
            }
        }
        $query = '';
        if (preg_match(self::regexp, $expression, $subs)){
            // The pattern can match nothing at all. Consuming zero characters
            // of a non-blank selector would recurse on the same text forever.
            if ('' === $subs[0] && '' !== trim($expression)){
                throw new InvalidArgumentException('Unsupported selector: '.$expression);
            }
            $brackets = array();
            if (isset($subs['id']) && '' !== $subs['id']){
                $brackets[] = "@id='".$subs['id']."'";
            }
            if (isset($subs['attr']) && '' !== $subs['attr']){
                // isset() rather than empty(): "0" is a legitimate attribute value.
                $attrValue = isset($subs['value']) ? $subs['value'] : '';
                $brackets[] = "@".$subs['attr']."='".$attrValue."'";
            }
            if (isset($subs['class']) && '' !== $subs['class']){
                $brackets[] = 'contains(concat(" ", normalize-space(@class), " "), " '.$subs['class'].' ")';
            }
            if (isset($subs['pseudo']) && '' !== $subs['pseudo']){
                $predicate = $this->getPseudoPredicate(
                    $subs['pseudo'],
                    isset($subs['expr']) ? $subs['expr'] : ''
                );
                if (null !== $predicate){
                    $brackets[] = $predicate;
                }
            }
            $query = ($rel ? '/' : '//')
                .((isset($subs['tag']) && '' !== $subs['tag']) ? $subs['tag'] : '*');
            $count = count($brackets);
            if ($count > 1){
                $query .= '[('.implode(') and (', $brackets).')]';
            }elseif (1 === $count){
                $query .= '['.$brackets[0].']';
            }
            // Whatever follows the matched step is translated recursively.
            $left = trim(substr($expression, strlen($subs[0])));
            if ('' !== $left){
                $childRel = isset($subs['rel']) ? '>' === $subs['rel'] : false;
                $query .= $this->getXpathSubquery($left, $childRel, $compile);
            }
        }
        if ($compile){
            self::$_compiledXpath[$key] = $query;
        }
        return $query;
    }

    /**
     * Builds the XPath predicate for a positional pseudo-class.
     *
     * Predicates are always boolean expressions ("position() = 1" rather
     * than a bare "1") so they keep their meaning when and-ed with others.
     * position() counts siblings that pass the step's tag test, so with an
     * explicit tag the result follows nth-of-type rather than nth-child.
     *
     * @param string $pseudo "first-child", "last-child" or "nth-child" (any case).
     * @param string $expr Argument of nth-child: "odd", "even", "N" or "An+B".
     * @return string|null Predicate, or null when nothing usable was given.
     */
    protected function getPseudoPredicate($pseudo, $expr){
        $pseudo = strtolower($pseudo);
        if ('first-child' === $pseudo){
            return 'position() = 1';
        }
        if ('last-child' === $pseudo){
            return 'position() = last()';
        }
        $expr = strtolower(trim($expr));
        if ('nth-child' !== $pseudo || '' === $expr){
            return null;
        }
        if ('odd' === $expr){
            return '(position() -1) mod 2 = 0 and position() >= 1';
        }
        if ('even' === $expr){
            return 'position() mod 2 = 0 and position() >= 0';
        }
        if (preg_match('/^(?:(?P<mul>[0-9]+)n\+)?(?P<pos>[0-9]+)$/', $expr, $esubs)){
            if ('' !== $esubs['mul'] && (int)$esubs['mul'] > 0){
                return '(position() -'.$esubs['pos'].') mod '.$esubs['mul']
                    .' = 0 and position() >= '.$esubs['pos'];
            }
            // Plain index, or a zero multiplier which degenerates to one.
            return 'position() = '.$esubs['pos'];
        }
        return null;
    }

    /**
     * Evaluates an XPath query against the wrapped DOM.
     *
     * @param string $xpathQuery
     * @return nokogiri|null Wrapper around the result list; null for an empty query.
     * @throws Exception When DOMXpath rejects the query.
     */
    protected function getElements($xpathQuery){
        if ('' === (string)$xpathQuery){
            return null;
        }
        $nodeList = $this->getXpath()->query($xpathQuery);
        if ($nodeList === false){
            throw new Exception('Malformed xpath');
        }
        return self::fromDom($nodeList);
    }

    /**
     * @return string XML serialisation of getDom().
     */
    public function toXml(){
        return $this->getDom()->saveXML();
    }

    /**
     * Converts the wrapped DOM (or a given node) to nested arrays.
     *
     * Element nodes become arrays holding attribute name => value pairs plus
     * child node name => list of converted children; text and comment nodes
     * become their string value.
     *
     * @param DOMNode|null $xnode Node to convert; null converts the wrapped DOM.
     * @return array|string For a wrapped node list: one entry per node. For a
     *                      wrapped document: the conversion of its first child
     *                      (an empty array for an empty document).
     */
    public function toArray($xnode = null){
        $array = array();
        if ($xnode === null){
            if ($this->_dom instanceof DOMNodeList){
                foreach ($this->_dom as $node){
                    $array[] = $this->toArray($node);
                }
                return $array;
            }
            $node = $this->getDom();
        }else{
            $node = $xnode;
        }
        if (in_array($node->nodeType, array(XML_TEXT_NODE, XML_COMMENT_NODE), true)){
            return $node->nodeValue;
        }
        if ($node->hasAttributes()){
            foreach ($node->attributes as $attr){
                $array[$attr->nodeName] = $attr->nodeValue;
            }
        }
        if ($node->hasChildNodes()){
            foreach ($node->childNodes as $childNode){
                $array[$childNode->nodeName][] = $this->toArray($childNode);
            }
        }
        if ($xnode === null){
            // first child; reset() needs real variables and a non-empty array
            if (!$array){
                return array();
            }
            $firstGroup = reset($array);
            return is_array($firstGroup) ? reset($firstGroup) : $firstGroup;
        }
        return $array;
    }

    /**
     * Iterates over the result of toArray().
     *
     * @return ArrayIterator
     */
    #[\ReturnTypeWillChange]
    public function getIterator(){
        // toArray() may yield a string for a lone text node; always hand
        // ArrayIterator an array.
        return new ArrayIterator((array)$this->toArray());
    }
}