Вход Регистрация
Файл: core/classes/class.html2text.php
Строк: 170
<?php

/*
 * Copyright (c) 2003 Jose Solorzano.  All rights reserved.
 * Redistribution of source must retain this copyright notice.
 */

/**
 * Class Html2Text. (HtmlParser example.)
 * Converts HTML to ASCII attempting to preserve
 * document structure.
 * To use, create an instance of Html2Text passing
 * the text to convert and the desired maximum
 * number of characters per line. Then invoke
 * convert() which returns ASCII text.
 */
class Html2Text {

    // Private fields

    /** Output line currently being assembled. */
    var $iCurrentLine = "";
    /** Last word or TOKEN_* value produced; it is re-offered to addWordToLine() on the next pass. */
    var $iCurrentWord = "";
    /** Words of the text node currently being consumed. */
    var $iCurrentWordArray;
    /** Index into $iCurrentWordArray of the next word to hand out. */
    var $iCurrentWordIndex;
    /** True while between <script> and </script>; words are swallowed in that state. */
    var $iInScript = false;
    /** True while the words of the current text node are still being handed out. */
    var $iInText = false;
    /** Current nesting depth of <ul>/<ol> lists. */
    var $iListLevel = 0;
    /** HTML passed to the constructor. */
    var $iHtmlText;
    /** Maximum line length, measured with strlen(). */
    var $iMaxColumns;
    /** HtmlParser instance created by convert(). */
    var $iHtmlParser;

    // Constants
    // Integer tokens; they are told apart from text words (always strings)
    // by strict (===) comparison.

    var $TOKEN_BR       = 0;
    var $TOKEN_P        = 1;
    var $TOKEN_LI       = 2;
    var $TOKEN_AFTERLI  = 3;
    var $TOKEN_UL       = 4;
    var $TOKEN_ENDUL    = 5;

    /**
     * Stores the HTML to convert and the line-length limit.
     *
     * @param string $aHtmlText   HTML text to convert.
     * @param int    $aMaxColumns Desired maximum number of characters per line.
     */
    function __construct ($aHtmlText, $aMaxColumns) {
        $this->iHtmlText = $aHtmlText;
        $this->iMaxColumns = $aMaxColumns;
    }

    /**
     * Legacy PHP 4-style constructor name, kept so existing code that calls
     * it explicitly (for example parent::Html2Text(...)) keeps working.
     * Since PHP 8 a method named after its class is no longer a constructor,
     * so the real initialisation lives in __construct().
     */
    function Html2Text ($aHtmlText, $aMaxColumns) {
        $this->__construct($aHtmlText, $aMaxColumns);
    }

    /**
     * Converts the stored HTML to plain text.
     *
     * Requires the HtmlParser class and its NODE_TYPE_* constants, which are
     * not defined in this file.
     *
     * @return string The converted text, each line terminated by "\r\n".
     */
    function convert() {
        $this->iHtmlParser = new HtmlParser($this->iHtmlText);
        $wholeText = "";
        while (($line = $this->getLine()) !== false) {
            $wholeText .= $line . "\r\n";
        }
        return $wholeText;
    }

    /**
     * Produces the next output line.
     *
     * @return string|false The next line (possibly empty), or false once the
     *                      input is exhausted and nothing is pending.
     */
    function getLine() {
        while (true) {
            // The pending word did not fit, or it was a break token: emit the
            // line built so far. The pending word stays in $iCurrentWord and
            // is offered again on the next call.
            if (!$this->addWordToLine($this->iCurrentWord)) {
                return $this->flushLine();
            }
            if ($this->getWord() === false) {
                if ($this->iCurrentLine === "") {
                    return false;
                }
                // End of input: emit what is left and reset the word state.
                $this->iInText = false;
                $this->iCurrentWord = "";
                return $this->flushLine();
            }
        }
    }

    /** Returns the line assembled so far and starts a new, empty one. */
    function flushLine () {
        $line = $this->iCurrentLine;
        $this->iCurrentLine = "";
        return $line;
    }

    /**
     * Tries to append a word, or apply a token, to the current line.
     *
     * @param string|int $word A text word or one of the TOKEN_* values.
     * @return bool True if the word was consumed; false if the current line
     *              must be emitted first (the word or follow-up token remains
     *              in $iCurrentWord).
     */
    function addWordToLine ($word) {
        if ($this->iInScript) {
            // Script content is dropped.
            return true;
        }
        if ($word === $this->TOKEN_BR) {
            $this->iCurrentWord = "";
            return false;
        }
        if ($word === $this->TOKEN_P
            || $word === $this->TOKEN_UL
            || $word === $this->TOKEN_ENDUL) {
            // End the current line now and queue a BR so one more line break follows.
            $this->iCurrentWord = $this->TOKEN_BR;
            return false;
        }
        if ($word === $this->TOKEN_LI) {
            // End the current line; the next line starts with the bullet.
            $this->iCurrentWord = $this->TOKEN_AFTERLI;
            return false;
        }

        $isBullet = ($word === $this->TOKEN_AFTERLI);
        $toAdd = $isBullet ? "" : $word;

        $lineWasEmpty = ($this->iCurrentLine === "");
        if ($lineWasEmpty) {
            $prefix = $this->getIndentation($isBullet);
        }
        else {
            $prefix = $this->iCurrentLine . " ";
        }

        $candidateLine = $prefix . $toAdd;
        // A word that overflows is pushed to the next line -- unless the line
        // is empty, in which case it is accepted anyway. Testing the line
        // before indentation was added matters: inside a list the prefix is
        // never empty, so testing the prefix would reject an over-long word
        // on every attempt and convert() would never terminate.
        if (strlen($candidateLine) > $this->iMaxColumns && !$lineWasEmpty) {
            return false;
        }
        $this->iCurrentLine = $candidateLine;
        return true;
    }

    /**
     * Advances the parser to the next word or structural token.
     *
     * @return string|int|false A text word, a TOKEN_* value, "" when entering
     *                          or leaving a script element, or false when the
     *                          parser reports no more nodes.
     */
    function getWord() {
        while (true) {
            if ($this->iHtmlParser->iNodeType == NODE_TYPE_TEXT) {
                if (!$this->iInText) {
                    // First visit to this text node: split it into words.
                    $this->iCurrentWordArray = $this->splitWords($this->iHtmlParser->iNodeValue);
                    $this->iCurrentWordIndex = 0;
                    $this->iInText = true;
                }
                if ($this->iCurrentWordIndex < count($this->iCurrentWordArray)) {
                    $this->iCurrentWord = $this->iCurrentWordArray[$this->iCurrentWordIndex++];
                    return $this->iCurrentWord;
                }
                // All words handed out; fall through to advance the parser.
                $this->iInText = false;
            }
            else if ($this->iHtmlParser->iNodeType == NODE_TYPE_ELEMENT) {
                // Read the name before consumeNode() advances the parser.
                $name = $this->iHtmlParser->iNodeName;
                if (strcasecmp($name, "br") == 0) {
                    return $this->consumeNode($this->TOKEN_BR);
                }
                else if (strcasecmp($name, "p") == 0) {
                    return $this->consumeNode($this->TOKEN_P);
                }
                else if (strcasecmp($name, "script") == 0) {
                    $this->iInScript = true;
                    return $this->consumeNode("");
                }
                else if (strcasecmp($name, "ul") == 0 || strcasecmp($name, "ol") == 0) {
                    $this->iListLevel++;
                    return $this->consumeNode($this->TOKEN_UL);
                }
                else if (strcasecmp($name, "li") == 0) {
                    return $this->consumeNode($this->TOKEN_LI);
                }
            }
            else if ($this->iHtmlParser->iNodeType == NODE_TYPE_ENDELEMENT) {
                $name = $this->iHtmlParser->iNodeName;
                if (strcasecmp($name, "script") == 0) {
                    $this->iInScript = false;
                    return $this->consumeNode("");
                }
                else if (strcasecmp($name, "ul") == 0 || strcasecmp($name, "ol") == 0) {
                    // Guard against unbalanced closing tags.
                    if ($this->iListLevel > 0) {
                        $this->iListLevel--;
                    }
                    return $this->consumeNode($this->TOKEN_ENDUL);
                }
            }
            // Node not handled above (or text node exhausted): move on.
            if (!$this->iHtmlParser->parse()) {
                return false;
            }
        }
    }

    /**
     * Advances the parser past the current node and records $word as the
     * current word.
     *
     * @param string|int $word Word or TOKEN_* value to record.
     * @return string|int The same $word.
     */
    function consumeNode ($word) {
        $this->iHtmlParser->parse();
        $this->iCurrentWord = $word;
        return $word;
    }

    /**
     * Splits text on runs of spaces, tabs, carriage returns and newlines,
     * then passes each piece through htmlDecode().
     *
     * Leading or trailing whitespace yields empty-string elements, as with
     * the original split() call.
     *
     * @param string $text Text node content.
     * @return array List of words.
     */
    function splitWords ($text) {
        // preg_split() replaces split(), which was removed in PHP 7.
        $words = preg_split("/[ \t\r\n]+/", $text);
        foreach ($words as $idx => $piece) {
            $words[$idx] = $this->htmlDecode($piece);
        }
        return $words;
    }

    /**
     * Placeholder for entity decoding; currently returns $text unchanged.
     * NOTE(review): left as a pass-through because it is not visible here
     * whether HtmlParser already decodes entities -- confirm before adding
     * html_entity_decode().
     */
    function htmlDecode ($text) {
        // TBD
        return $text;
    }

    /**
     * Builds the prefix for a new line at the current list depth.
     *
     * @param bool $hasLI True if the line starts a list item (gets "- ").
     * @return string Two spaces per enclosing list level, followed by "- " or
     *                "  " when inside a list; "" outside any list.
     */
    function getIndentation ($hasLI) {
        $indent = str_repeat("  ", max(0, $this->iListLevel - 1));
        if ($this->iListLevel > 0) {
            $indent .= $hasLI ? "- " : "  ";
        }
        return $indent;
    }
}
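// Online: 2    (page footer captured with the listing; not part of the source file)
// Advertising  (page footer captured with the listing; not part of the source file)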