Вход Регистрация
Файл: libs/HTMLPurifier/HTMLPurifier/Lexer/DirectLex.php
Строк: 674
<?php

/**
 * Our in-house implementation of a parser.
 *
 * A pure PHP parser, DirectLex has absolutely no dependencies, making
 * it a reasonably good default for PHP4.  Written with efficiency in mind,
 * it can be four times faster than HTMLPurifier_Lexer_PEARSax3, although it
 * pales in comparison to HTMLPurifier_Lexer_DOMLex.
 *
 * @todo Reread XML spec and document differences.
 */
class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
{

    /**
     * Flag advertising that this lexer attaches line (and column)
     * information to the tokens it produces.
     */
    public $tracksLineNumbers = true;

    /**
     * Whitespace characters for str(c)spn: space, tab, CR and LF.
     */
    protected $_whitespace = "\x20\x09\x0D\x0A";

    /**
     * Callback function for script CDATA fudge.
     *
     * Escapes the contents of a script element while leaving the opening
     * and closing tags untouched.
     *
     * @param $matches, in form of array(full match, opening tag, contents, closing tag)
     * @return String: opening tag . escaped contents . closing tag
     */
    protected function scriptCallback($matches) {
        return $matches[1] . htmlspecialchars($matches[2], ENT_COMPAT, 'UTF-8') . $matches[3];
    }

    /**
     * Splits an HTML string into a flat list of token objects.
     *
     * The method walks the string with a cursor, alternating between
     * "outside a tag" (text up to the next '<') and "inside a tag"
     * (everything up to the next '>'), and emits Text, Comment, Start,
     * End and Empty tokens accordingly.
     *
     * Configuration directives read here: HTML.Trusted,
     * Core.MaintainLineNumbers, Core.CollectErrors and
     * Core.DirectLexLineNumberSyncInterval.
     *
     * While running, 'CurrentLine' and 'CurrentCol' are registered in
     * $context; both are destroyed again before returning.
     *
     * @param $html    String of HTML to tokenize.
     * @param $config  Configuration object (queried via ->get()).
     * @param $context Context object (register/get/destroy are used).
     * @return Array of token objects, in document order.
     */
    public function tokenizeHTML($html, $config, $context) {

        // special normalization for script tags without any armor
        // our "armor" heuristic is a < sign any number of whitespaces after
        // the first script tag
        if ($config->get('HTML.Trusted')) {
            $html = preg_replace_callback('#(<script[^>]*>)(\s*[^<].+?)(</script>)#si',
                array($this, 'scriptCallback'), $html);
        }

        $html = $this->normalize($html, $config, $context);

        $cursor = 0; // our location in the text
        $inside_tag = false; // whether or not we're parsing the inside of a tag
        $array = array(); // result array

        // This is also treated to mean maintain *column* numbers too
        $maintain_line_numbers = $config->get('Core.MaintainLineNumbers');

        if ($maintain_line_numbers === null) {
            // automatically determine line numbering by checking
            // if error collection is on
            $maintain_line_numbers = $config->get('Core.CollectErrors');
        }

        if ($maintain_line_numbers) {
            $current_line = 1;
            $current_col  = 0;
            $length = strlen($html);
        } else {
            $current_line = false;
            $current_col  = false;
            $length = false;
        }
        $context->register('CurrentLine', $current_line);
        $context->register('CurrentCol',  $current_col);
        $nl = "\n";
        // how often to manually recalculate. This will ALWAYS be right,
        // but it's pretty wasteful. Set to 0 to turn off
        $synchronize_interval = $config->get('Core.DirectLexLineNumberSyncInterval');

        $e = false;
        if ($config->get('Core.CollectErrors')) {
            $e =& $context->get('ErrorCollector');
        }

        // for testing synchronization
        $loops = 0;

        while(++$loops) {

            // $cursor is either at the start of a token, or inside of
            // a tag (i.e. there was a < immediately before it), as indicated
            // by $inside_tag

            if ($maintain_line_numbers) {

                // $rcursor, however, is always at the start of a token.
                $rcursor = $cursor - (int) $inside_tag;

                // Column number is cheap, so we calculate it every round.
                // We're interested at the *end* of the newline string, so
                // we need to add strlen($nl) == 1 to $nl_pos before subtracting it
                // from our "rcursor" position.
                // ($rcursor - $length is a non-positive offset, i.e. it is
                // counted from the end of the string.)
                $nl_pos = strrpos($html, $nl, $rcursor - $length);
                $current_col = $rcursor - (is_bool($nl_pos) ? 0 : $nl_pos + 1);

                // recalculate lines
                if (
                    $synchronize_interval &&  // synchronization is on
                    $cursor > 0 &&            // cursor is further than zero
                    $loops % $synchronize_interval === 0 // time to synchronize!
                ) {
                    $current_line = 1 + $this->substrCount($html, $nl, 0, $cursor);
                }

            }

            $position_next_lt = strpos($html, '<', $cursor);
            $position_next_gt = strpos($html, '>', $cursor);

            // triggers on "<b>asdf</b>" but not "asdf <b></b>"
            // special case to set up context
            if ($position_next_lt === $cursor) {
                $inside_tag = true;
                $cursor++;
            }

            if (!$inside_tag && $position_next_lt !== false) {
                // We are not inside tag and there still is another tag to parse
                $token = new
                    HTMLPurifier_Token_Text(
                        $this->parseData(
                            substr(
                                $html, $cursor, $position_next_lt - $cursor
                            )
                        )
                    );
                if ($maintain_line_numbers) {
                    $token->rawPosition($current_line, $current_col);
                    $current_line += $this->substrCount($html, $nl, $cursor, $position_next_lt - $cursor);
                }
                $array[] = $token;
                $cursor  = $position_next_lt + 1;
                $inside_tag = true;
                continue;
            } elseif (!$inside_tag) {
                // We are not inside tag but there are no more tags
                // If we're already at the end, break
                if ($cursor === strlen($html)) break;
                // Create Text of rest of string
                $token = new
                    HTMLPurifier_Token_Text(
                        $this->parseData(
                            substr(
                                $html, $cursor
                            )
                        )
                    );
                if ($maintain_line_numbers) $token->rawPosition($current_line, $current_col);
                $array[] = $token;
                break;
            } elseif ($inside_tag && $position_next_gt !== false) {
                // We are in tag and it is well formed
                // Grab the internals of the tag
                $strlen_segment = $position_next_gt - $cursor;

                if ($strlen_segment < 1) {
                    // there's nothing to process!
                    // NOTE(review): this token is created but never appended
                    // to $array, so an empty "<>" produces no output here.
                    // Confirm whether that is intentional before changing it.
                    $token = new HTMLPurifier_Token_Text('<');
                    $cursor++;
                    continue;
                }

                $segment = substr($html, $cursor, $strlen_segment);

                if ($segment === false) {
                    // somehow, we attempted to access beyond the end of
                    // the string, defense-in-depth, reported by Nate Abele
                    break;
                }

                // Check if it's a comment
                if (
                    substr($segment, 0, 3) === '!--'
                ) {
                    // re-determine segment length, looking for -->
                    $position_comment_end = strpos($html, '-->', $cursor);
                    if ($position_comment_end === false) {
                        // uh oh, we have a comment that extends to
                        // infinity. Can't be helped: set comment
                        // end position to end of string
                        if ($e) $e->send(E_WARNING, 'Lexer: Unclosed comment');
                        $position_comment_end = strlen($html);
                        $end = true;
                    } else {
                        $end = false;
                    }
                    $strlen_segment = $position_comment_end - $cursor;
                    $segment = substr($html, $cursor, $strlen_segment);
                    // strip the leading "!--" from the comment body
                    $token = new
                        HTMLPurifier_Token_Comment(
                            substr(
                                $segment, 3, $strlen_segment - 3
                            )
                        );
                    if ($maintain_line_numbers) {
                        $token->rawPosition($current_line, $current_col);
                        $current_line += $this->substrCount($html, $nl, $cursor, $strlen_segment);
                    }
                    $array[] = $token;
                    // skip past "-->" unless the comment ran to end of input
                    $cursor = $end ? $position_comment_end : $position_comment_end + 3;
                    $inside_tag = false;
                    continue;
                }

                // Check if it's an end tag
                $is_end_tag = (strpos($segment,'/') === 0);
                if ($is_end_tag) {
                    $type = substr($segment, 1);
                    $token = new HTMLPurifier_Token_End($type);
                    if ($maintain_line_numbers) {
                        $token->rawPosition($current_line, $current_col);
                        $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
                    }
                    $array[] = $token;
                    $inside_tag = false;
                    $cursor = $position_next_gt + 1;
                    continue;
                }

                // Check leading character is alphabetic, if not, we may
                // have accidentally grabbed an emoticon. Translate into
                // text and go our merry way
                if (!ctype_alpha($segment[0])) {
                    // XML:  $segment[0] !== '_' && $segment[0] !== ':'
                    if ($e) $e->send(E_NOTICE, 'Lexer: Unescaped lt');
                    $token = new HTMLPurifier_Token_Text('<');
                    if ($maintain_line_numbers) {
                        $token->rawPosition($current_line, $current_col);
                        $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
                    }
                    $array[] = $token;
                    // cursor deliberately not advanced: the rest of the
                    // segment is re-read as ordinary text on the next round
                    $inside_tag = false;
                    continue;
                }

                // Check if it is explicitly self closing, if so, remove
                // trailing slash. Remember, we could have a tag like <br>, so
                // any later token processing scripts must convert improperly
                // classified EmptyTags from StartTags.
                $is_self_closing = (strrpos($segment,'/') === $strlen_segment-1);
                if ($is_self_closing) {
                    $strlen_segment--;
                    $segment = substr($segment, 0, $strlen_segment);
                }

                // Check if there are any attributes
                $position_first_space = strcspn($segment, $this->_whitespace);

                if ($position_first_space >= $strlen_segment) {
                    // no whitespace in the segment: it is a bare tag name
                    if ($is_self_closing) {
                        $token = new HTMLPurifier_Token_Empty($segment);
                    } else {
                        $token = new HTMLPurifier_Token_Start($segment);
                    }
                    if ($maintain_line_numbers) {
                        $token->rawPosition($current_line, $current_col);
                        $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
                    }
                    $array[] = $token;
                    $inside_tag = false;
                    $cursor = $position_next_gt + 1;
                    continue;
                }

                // Grab out all the data
                $type = substr($segment, 0, $position_first_space);
                $attribute_string =
                    trim(
                        substr(
                            $segment, $position_first_space
                        )
                    );
                if ($attribute_string) {
                    $attr = $this->parseAttributeString(
                                    $attribute_string
                                  , $config, $context
                              );
                } else {
                    $attr = array();
                }

                if ($is_self_closing) {
                    $token = new HTMLPurifier_Token_Empty($type, $attr);
                } else {
                    $token = new HTMLPurifier_Token_Start($type, $attr);
                }
                if ($maintain_line_numbers) {
                    $token->rawPosition($current_line, $current_col);
                    $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
                }
                $array[] = $token;
                $cursor = $position_next_gt + 1;
                $inside_tag = false;
                continue;
            } else {
                // inside tag, but there's no ending > sign
                if ($e) $e->send(E_WARNING, 'Lexer: Missing gt');
                $token = new
                    HTMLPurifier_Token_Text(
                        '<' .
                        $this->parseData(
                            substr($html, $cursor)
                        )
                    );
                if ($maintain_line_numbers) $token->rawPosition($current_line, $current_col);
                // no cursor scroll? Hmm...
                $array[] = $token;
                break;
            }
            // every branch above either continues or breaks, so this is
            // only a safety net
            break;
        }

        $context->destroy('CurrentLine');
        $context->destroy('CurrentCol');
        return $array;
    }

    /**
     * PHP 5.0.x compatible substr_count that implements offset and length.
     *
     * On PHP older than 5.1 the haystack is sliced with substr() first and
     * then counted; otherwise the native offset/length arguments are used.
     * The version check is done once and cached in a static variable.
     *
     * @param $haystack String to search in.
     * @param $needle   Substring to count.
     * @param $offset   Byte offset at which counting starts.
     * @param $length   Number of bytes after $offset to examine.
     * @return Integer number of occurrences of $needle in the given window.
     */
    protected function substrCount($haystack, $needle, $offset, $length) {
        static $oldVersion;
        if ($oldVersion === null) {
            $oldVersion = version_compare(PHP_VERSION, '5.1', '<');
        }
        if ($oldVersion) {
            $haystack = substr($haystack, $offset, $length);
            return substr_count($haystack, $needle);
        } else {
            return substr_count($haystack, $needle, $offset, $length);
        }
    }

    /**
     * Takes the inside of an HTML tag and makes an assoc array of attributes.
     *
     * Boolean attributes (a key with no value) are returned with the key
     * as their own value. Values are passed through parseData().
     * When Core.CollectErrors is enabled, problems are reported to the
     * 'ErrorCollector' fetched from $context.
     *
     * @param $string  Inside of tag excluding name.
     * @param $config  Configuration object (Core.CollectErrors is read).
     * @param $context Context object supplying the error collector.
     * @returns Assoc array of attributes.
     */
    public function parseAttributeString($string, $config, $context) {
        $string = (string) $string; // quick typecast

        if ($string == '') return array(); // no attributes

        $e = false;
        if ($config->get('Core.CollectErrors')) {
            $e =& $context->get('ErrorCollector');
        }

        // let's see if we can abort as quickly as possible
        // one equal sign, no spaces => one attribute
        $num_equal = substr_count($string, '=');
        $has_space = strpos($string, ' ');
        if ($num_equal === 0 && !$has_space) {
            // bool attribute
            return array($string => $string);
        } elseif ($num_equal === 1 && !$has_space) {
            // only one attribute
            list($key, $quoted_value) = explode('=', $string);
            $quoted_value = trim($quoted_value);
            if (!$key) {
                if ($e) $e->send(E_ERROR, 'Lexer: Missing attribute key');
                return array();
            }
            if (!$quoted_value) return array($key => '');
            $first_char = @$quoted_value[0];
            $last_char  = @$quoted_value[strlen($quoted_value)-1];

            $same_quote = ($first_char == $last_char);
            $open_quote = ($first_char == '"' || $first_char == "'");

            if ( $same_quote && $open_quote) {
                // well behaved
                $value = substr($quoted_value, 1, strlen($quoted_value) - 2);
            } else {
                // not well behaved
                if ($open_quote) {
                    if ($e) $e->send(E_ERROR, 'Lexer: Missing end quote');
                    $value = substr($quoted_value, 1);
                } else {
                    $value = $quoted_value;
                }
            }
            if ($value === false) $value = '';
            return array($key => $this->parseData($value));
        }

        // setup loop environment
        $array  = array(); // return assoc array of attributes
        $cursor = 0; // current position in string (moves forward)
        $size   = strlen($string); // size of the string (stays the same)

        // if we have unquoted attributes, the parser expects a terminating
        // space, so let's guarantee that there's always a terminating space.
        $string .= ' ';

        while(true) {

            if ($cursor >= $size) {
                break;
            }

            // skip leading whitespace
            $cursor += ($value = strspn($string, $this->_whitespace, $cursor));
            // grab the key

            $key_begin = $cursor; //we're currently at the start of the key

            // scroll past all characters that are the key (not whitespace or =)
            $cursor += strcspn($string, $this->_whitespace . '=', $cursor);

            $key_end = $cursor; // now at the end of the key

            $key = substr($string, $key_begin, $key_end - $key_begin);

            if (!$key) {
                if ($e) $e->send(E_ERROR, 'Lexer: Missing attribute key');
                $cursor += strcspn($string, $this->_whitespace, $cursor + 1); // prevent infinite loop
                continue; // empty key
            }

            // scroll past all whitespace
            $cursor += strspn($string, $this->_whitespace, $cursor);

            if ($cursor >= $size) {
                $array[$key] = $key;
                break;
            }

            // if the next character is an equal sign, we've got a regular
            // pair, otherwise, it's a bool attribute
            $first_char = @$string[$cursor];

            if ($first_char == '=') {
                // key="value"

                $cursor++;
                $cursor += strspn($string, $this->_whitespace, $cursor);

                // NOTE(review): $cursor is an integer after the += above, so
                // this comparison never appears to succeed; kept as-is.
                if ($cursor === false) {
                    $array[$key] = '';
                    break;
                }

                // we might be in front of a quote right now

                $char = @$string[$cursor];

                if ($char == '"' || $char == "'") {
                    // it's quoted, end bound is $char
                    $cursor++;
                    $value_begin = $cursor;
                    $cursor = strpos($string, $char, $cursor);
                    $value_end = $cursor;
                } else {
                    // it's not quoted, end bound is whitespace
                    $value_begin = $cursor;
                    $cursor += strcspn($string, $this->_whitespace, $cursor);
                    $value_end = $cursor;
                }

                // we reached a premature end
                // (strpos() found no closing quote and returned false)
                if ($cursor === false) {
                    $cursor = $size;
                    $value_end = $cursor;
                }

                $value = substr($string, $value_begin, $value_end - $value_begin);
                if ($value === false) $value = '';
                $array[$key] = $this->parseData($value);
                $cursor++;

            } else {
                // boolattr
                if ($key !== '') {
                    $array[$key] = $key;
                } else {
                    // purely theoretical
                    if ($e) $e->send(E_ERROR, 'Lexer: Missing attribute key');
                }

            }
        }
        return $array;
    }

}

// vim: et sw=4 sts=4
Онлайн: 1
Реклама