Файл: core/classes/class.html2text.php
Строк: 170
/*
 * Copyright (c) 2003 Jose Solorzano. All rights reserved.
 * Redistribution of source must retain this copyright notice.
 */
/**
 * Class Html2Text. (HtmlParser example.)
 * Converts HTML to ASCII, attempting to preserve
 * document structure.
 * To use, create an instance of Html2Text, passing
 * the text to convert and the desired maximum
 * number of characters per line. Then invoke
 * convert(), which returns ASCII text.
 */
class Html2Text
{
    // Pipeline: convert() pulls finished lines from getLine(); getLine() feeds
    // the words and layout tokens produced by getWord() into addWordToLine()
    // until a line is complete.

    // Private fields. They are declared public because the legacy "var"
    // declarations were public and existing callers may read them.

    /** @var string Text of the line currently being assembled. */
    public $iCurrentLine = "";
    /** @var string|int Word or TOKEN_* value waiting to be placed on a line. */
    public $iCurrentWord = "";
    /** @var array Words of the text node currently being consumed. */
    public $iCurrentWordArray;
    /** @var int Index of the next unread entry in $iCurrentWordArray. */
    public $iCurrentWordIndex;
    /** @var bool True between <script> and </script>; words are discarded. */
    public $iInScript = false;
    /** @var bool True while the current text node still has unread words. */
    public $iInText = false;
    /** @var int Nesting depth of the <ul>/<ol> lists currently open. */
    public $iListLevel = 0;
    /** @var string HTML handed to the constructor. */
    public $iHtmlText;
    /** @var int Maximum number of characters per output line. */
    public $iMaxColumns;
    /** @var object|null Parser created by convert(); null before that. */
    public $iHtmlParser;
    /** @var bool Set once the parser has reported the end of its input. */
    private $iParserDone = false;

    // Constants (layout tokens). They are integers so that a strict comparison
    // can never confuse them with a word taken from the document text.

    public $TOKEN_BR = 0;
    public $TOKEN_P = 1;
    public $TOKEN_LI = 2;
    public $TOKEN_AFTERLI = 3;
    public $TOKEN_UL = 4;
    public $TOKEN_ENDUL = 5;

    /**
     * @param string $aHtmlText   HTML to convert.
     * @param int    $aMaxColumns Desired maximum number of characters per line.
     */
    public function __construct($aHtmlText, $aMaxColumns)
    {
        $this->iHtmlText = $aHtmlText;
        $this->iMaxColumns = $aMaxColumns;
    }

    /**
     * Legacy PHP 4 style constructor, kept so that existing explicit calls
     * (for example parent::Html2Text(...) in a subclass) keep working. PHP 8
     * no longer treats it as a constructor, hence __construct() above.
     *
     * @param string $aHtmlText   HTML to convert.
     * @param int    $aMaxColumns Desired maximum number of characters per line.
     */
    public function Html2Text($aHtmlText, $aMaxColumns)
    {
        $this->__construct($aHtmlText, $aMaxColumns);
    }

    /**
     * Converts the HTML given to the constructor into plain text.
     *
     * @return string The wrapped text, every line terminated by CRLF.
     */
    public function convert()
    {
        // Start from a clean state so that convert() can be called repeatedly.
        $this->iCurrentLine = "";
        $this->iCurrentWord = "";
        $this->iInText = false;
        $this->iInScript = false;
        $this->iListLevel = 0;
        $this->iParserDone = false;
        $this->iHtmlParser = new HtmlParser($this->iHtmlText);

        $wholeText = "";
        while (($line = $this->getLine()) !== false) {
            $wholeText .= $line . "\r\n";
        }
        return $wholeText;
    }

    /**
     * Assembles and returns the next output line.
     *
     * @return string|false The next line without its terminator, or false
     *                      when the document holds no further text.
     */
    public function getLine()
    {
        while (true) {
            // The pending word did not fit, or it was a line-breaking token:
            // hand out what has been collected so far.
            if (!$this->addWordToLine($this->iCurrentWord)) {
                return $this->flushLine();
            }
            if ($this->getWord() === false) {
                // End of input. A line holding nothing but list indentation
                // counts as empty; otherwise an unclosed list would make this
                // method return such a line on every call, forever.
                if (trim($this->iCurrentLine) === "") {
                    $this->iCurrentLine = "";
                    return false;
                }
                $this->iInText = false;
                $this->iCurrentWord = "";
                return $this->flushLine();
            }
        }
    }

    /**
     * Tries to append a word, or to apply a layout token, to the current line.
     *
     * @param string|int $word A word of text or one of the TOKEN_* values.
     * @return bool True when the word was consumed; false when the current
     *              line has to be emitted first. In the false case
     *              $iCurrentWord holds what must be processed next.
     */
    public function addWordToLine($word)
    {
        if ($this->iInScript) {
            // Everything inside <script> is swallowed.
            return true;
        }
        if ($word === $this->TOKEN_BR) {
            $this->iCurrentWord = "";
            return false;
        }
        if ($word === $this->TOKEN_P
            || $word === $this->TOKEN_UL
            || $word === $this->TOKEN_ENDUL
        ) {
            // End the current line now and queue one more break behind it.
            $this->iCurrentWord = $this->TOKEN_BR;
            return false;
        }
        if ($word === $this->TOKEN_LI) {
            // End the current line; the bullet is drawn on the next one.
            $this->iCurrentWord = $this->TOKEN_AFTERLI;
            return false;
        }

        $isBullet = ($word === $this->TOKEN_AFTERLI);
        $toAdd = $isBullet ? "" : $word;

        $lineHadContent = ($this->iCurrentLine !== "");
        if ($lineHadContent) {
            $prefix = $this->iCurrentLine . " ";
        } else {
            $prefix = $this->getIndentation($isBullet);
        }
        $candidateLine = $prefix . $toAdd;

        // Only wrap when something is already on the line. A word that is too
        // long for an empty line is accepted as is, even behind list
        // indentation; rejecting it would make it fail again on every retry.
        if ($lineHadContent && strlen($candidateLine) > $this->iMaxColumns) {
            return false;
        }
        $this->iCurrentLine = $candidateLine;
        return true;
    }

    /**
     * Fetches the next word or layout token from the parser and stores it in
     * $iCurrentWord.
     *
     * @return string|int|false A word, a TOKEN_* value, "" for a script
     *                          boundary, or false at the end of the input.
     */
    public function getWord()
    {
        if (!is_object($this->iHtmlParser)) {
            // convert() has not created a parser yet.
            return false;
        }
        while (!$this->iParserDone) {
            $nodeType = $this->iHtmlParser->iNodeType;
            if ($nodeType == NODE_TYPE_TEXT) {
                if (!$this->iInText) {
                    $this->iCurrentWordArray = $this->splitWords($this->iHtmlParser->iNodeValue);
                    $this->iCurrentWordIndex = 0;
                    $this->iInText = true;
                }
                if ($this->iCurrentWordIndex < count($this->iCurrentWordArray)) {
                    $this->iCurrentWord = $this->iCurrentWordArray[$this->iCurrentWordIndex++];
                    return $this->iCurrentWord;
                }
                // Every word of this text node has been handed out.
                $this->iInText = false;
            } elseif ($nodeType == NODE_TYPE_ELEMENT || $nodeType == NODE_TYPE_ENDELEMENT) {
                $tagName = strtolower((string) $this->iHtmlParser->iNodeName);
                $token = ($nodeType == NODE_TYPE_ELEMENT)
                    ? $this->startTagToken($tagName)
                    : $this->endTagToken($tagName);
                if ($token !== null) {
                    // Step past the tag before returning; otherwise the next
                    // call would see the same tag and return it again.
                    $this->advanceParser();
                    $this->iCurrentWord = $token;
                    return $this->iCurrentWord;
                }
            }
            if (!$this->advanceParser()) {
                break;
            }
        }
        return false;
    }

    /**
     * Splits text into words on runs of spaces, tabs and line breaks.
     * Leading or trailing whitespace yields an empty first or last word.
     *
     * @param string $text Raw text of one text node.
     * @return array The words, each passed through htmlDecode().
     */
    public function splitWords($text)
    {
        $words = preg_split('/[ \t\r\n]+/', (string) $text);
        if ($words === false) {
            return array();
        }
        foreach ($words as $idx => $word) {
            $words[$idx] = $this->htmlDecode($word);
        }
        return $words;
    }

    /**
     * Placeholder for entity decoding; currently returns its input unchanged.
     * NOTE(review): whether HtmlParser already decodes entities is not visible
     * from this class - confirm before adding html_entity_decode() here.
     *
     * @param string $text
     * @return string
     */
    public function htmlDecode($text)
    {
        // TBD
        return $text;
    }

    /**
     * Builds the prefix for a new line at the current list depth.
     *
     * @param bool $hasLI True when the line starts a list item and needs a
     *                    bullet; false for a continuation line.
     * @return string "" outside of lists, otherwise one pad per outer level
     *                followed by the bullet or one more pad.
     */
    public function getIndentation($hasLI)
    {
        if ($this->iListLevel <= 0) {
            return "";
        }
        $bullet = "- ";
        // The pad is as wide as the bullet so that every nesting level and
        // every continuation line moves in by the same amount.
        $pad = str_repeat(" ", strlen($bullet));
        return str_repeat($pad, $this->iListLevel - 1) . ($hasLI ? $bullet : $pad);
    }

    /**
     * Returns the collected line and starts a new, empty one.
     *
     * @return string
     */
    private function flushLine()
    {
        $line = $this->iCurrentLine;
        $this->iCurrentLine = "";
        return $line;
    }

    /**
     * Moves the parser to its next node and remembers when it reports the end
     * of input, so that a node it may leave behind is never read again.
     * Assumes parse() returns a false value once no node is left, which is
     * what the termination test of the reading loop relies on.
     *
     * @return bool True when a new node is available.
     */
    private function advanceParser()
    {
        if (!$this->iParserDone && !$this->iHtmlParser->parse()) {
            $this->iParserDone = true;
        }
        return !$this->iParserDone;
    }

    /**
     * Maps an opening tag to its layout token and updates script/list state.
     *
     * @param string $tagName Lower-cased tag name.
     * @return string|int|null The token, or null for tags without layout effect.
     */
    private function startTagToken($tagName)
    {
        switch ($tagName) {
            case "br":
                return $this->TOKEN_BR;
            case "p":
                return $this->TOKEN_P;
            case "script":
                $this->iInScript = true;
                return "";
            case "ul":
            case "ol":
                $this->iListLevel++;
                return $this->TOKEN_UL;
            case "li":
                return $this->TOKEN_LI;
            default:
                return null;
        }
    }

    /**
     * Maps a closing tag to its layout token and updates script/list state.
     *
     * @param string $tagName Lower-cased tag name.
     * @return string|int|null The token, or null for tags without layout effect.
     */
    private function endTagToken($tagName)
    {
        switch ($tagName) {
            case "script":
                $this->iInScript = false;
                return "";
            case "ul":
            case "ol":
                // Guard against stray closing tags in unbalanced markup.
                if ($this->iListLevel > 0) {
                    $this->iListLevel--;
                }
                return $this->TOKEN_ENDUL;
            default:
                return null;
        }
    }
}