_already_parsed; $this->skip_whitespace(); if ( $this->bytes_already_parsed >= strlen( $this->html ) ) { $this->parser_state = self::STATE_INCOMPLETE_INPUT; return false; } switch ( $this->html[ $this->bytes_already_parsed ] ) { case "'": case '"': $quote = $this->html[ $this->bytes_already_parsed ]; $value_start = $this->bytes_already_parsed + 1; $value_length = strcspn( $this->html, $quote, $value_start ); $attribute_end = $value_start + $value_length + 1; $this->bytes_already_parsed = $attribute_end; break; default: $value_start = $this->bytes_already_parsed; $value_length = strcspn( $this->html, "> \t\f\r\n", $value_start ); $attribute_end = $value_start + $value_length; $this->bytes_already_parsed = $attribute_end; } } else { $value_start = $this->bytes_already_parsed; $value_length = 0; $attribute_end = $attribute_start + $name_length; } if ( $attribute_end >= strlen( $this->html ) ) { $this->parser_state = self::STATE_INCOMPLETE_INPUT; return false; } if ( $this->is_closing_tag ) { return true; } /* * > There must never be two or more attributes on * > the same start tag whose names are an ASCII * > case-insensitive match for each other. * - HTML 5 spec * * @see https://html.spec.whatwg.org/multipage/syntax.html#attributes-2:ascii-case-insensitive */ $comparable_name = strtolower( $attribute_name ); // If an attribute is listed many times, only use the first declaration and ignore the rest. if ( ! array_key_exists( $comparable_name, $this->attributes ) ) { $this->attributes[ $comparable_name ] = new WP_HTML_Attribute_Token( $attribute_name, $value_start, $value_length, $attribute_start, $attribute_end - $attribute_start, ! $has_value ); return true; } /* * Track the duplicate attributes so if we remove it, all disappear together. * * While `$this->duplicated_attributes` could always be stored as an `array()`, * which would simplify the logic here, storing a `null` and only allocating * an array when encountering duplicates avoids needless allocations in the * normative case of parsing tags with no duplicate attributes. */ $duplicate_span = new WP_HTML_Span( $attribute_start, $attribute_end - $attribute_start ); if ( null === $this->duplicate_attributes ) { $this->duplicate_attributes = array( $comparable_name => array( $duplicate_span ) ); } elseif ( ! array_key_exists( $comparable_name, $this->duplicate_attributes ) ) { $this->duplicate_attributes[ $comparable_name ] = array( $duplicate_span ); } else { $this->duplicate_attributes[ $comparable_name ][] = $duplicate_span; } return true; } /** * Move the internal cursor past any immediate successive whitespace. * * @since 6.2.0 */ private function skip_whitespace() { $this->bytes_already_parsed += strspn( $this->html, " \t\f\r\n", $this->bytes_already_parsed ); } /** * Applies attribute updates and cleans up once a tag is fully parsed. * * @since 6.2.0 */ private function after_tag() { /* * There could be lexical updates enqueued for an attribute that * also exists on the next tag. In order to avoid conflating the * attributes across the two tags, lexical updates with names * need to be flushed to raw lexical updates. */ $this->class_name_updates_to_attributes_updates(); /* * Purge updates if there are too many. The actual count isn't * scientific, but a few values from 100 to a few thousand were * tests to find a practially-useful limit. * * If the update queue grows too big, then the Tag Processor * will spend more time iterating through them and lose the * efficiency gains of deferring applying them. */ if ( 1000 < count( $this->lexical_updates ) ) { $this->get_updated_html(); } foreach ( $this->lexical_updates as $name => $update ) { /* * Any updates appearing after the cursor should be applied * before proceeding, otherwise they may be overlooked. */ if ( $update->start >= $this->bytes_already_parsed ) { $this->get_updated_html(); break; } if ( is_int( $name ) ) { continue; } $this->lexical_updates[] = $update; unset( $this->lexical_updates[ $name ] ); } $this->token_starts_at = null; $this->token_length = null; $this->tag_name_starts_at = null; $this->tag_name_length = null; $this->text_starts_at = 0; $this->text_length = 0; $this->is_closing_tag = null; $this->attributes = array(); $this->comment_type = null; $this->duplicate_attributes = null; } /** * Converts class name updates into tag attributes updates * (they are accumulated in different data formats for performance). * * @since 6.2.0 * * @see WP_HTML_Tag_Processor::$lexical_updates * @see WP_HTML_Tag_Processor::$classname_updates */ private function class_name_updates_to_attributes_updates() { if ( count( $this->classname_updates ) === 0 ) { return; } $existing_class = $this->get_enqueued_attribute_value( 'class' ); if ( null === $existing_class || true === $existing_class ) { $existing_class = ''; } if ( false === $existing_class && isset( $this->attributes['class'] ) ) { $existing_class = substr( $this->html, $this->attributes['class']->value_starts_at, $this->attributes['class']->value_length ); } if ( false === $existing_class ) { $existing_class = ''; } /** * Updated "class" attribute value. * * This is incrementally built while scanning through the existing class * attribute, skipping removed classes on the way, and then appending * added classes at the end. Only when finished processing will the * value contain the final new value. * @var string $class */ $class = ''; /** * Tracks the cursor position in the existing * class attribute value while parsing. * * @var int $at */ $at = 0; /** * Indicates if there's any need to modify the existing class attribute. * * If a call to `add_class()` and `remove_class()` wouldn't impact * the `class` attribute value then there's no need to rebuild it. * For example, when adding a class that's already present or * removing one that isn't. * * This flag enables a performance optimization when none of the enqueued * class updates would impact the `class` attribute; namely, that the * processor can continue without modifying the input document, as if * none of the `add_class()` or `remove_class()` calls had been made. * * This flag is set upon the first change that requires a string update. * * @var bool $modified */ $modified = false; // Remove unwanted classes by only copying the new ones. $existing_class_length = strlen( $existing_class ); while ( $at < $existing_class_length ) { // Skip to the first non-whitespace character. $ws_at = $at; $ws_length = strspn( $existing_class, " \t\f\r\n", $ws_at ); $at += $ws_length; // Capture the class name – it's everything until the next whitespace. $name_length = strcspn( $existing_class, " \t\f\r\n", $at ); if ( 0 === $name_length ) { // If no more class names are found then that's the end. break; } $name = substr( $existing_class, $at, $name_length ); $at += $name_length; // If this class is marked for removal, start processing the next one. $remove_class = ( isset( $this->classname_updates[ $name ] ) && self::REMOVE_CLASS === $this->classname_updates[ $name ] ); // If a class has already been seen then skip it; it should not be added twice. if ( ! $remove_class ) { $this->classname_updates[ $name ] = self::SKIP_CLASS; } if ( $remove_class ) { $modified = true; continue; } /* * Otherwise, append it to the new "class" attribute value. * * There are options for handling whitespace between tags. * Preserving the existing whitespace produces fewer changes * to the HTML content and should clarify the before/after * content when debugging the modified output. * * This approach contrasts normalizing the inter-class * whitespace to a single space, which might appear cleaner * in the output HTML but produce a noisier change. */ $class .= substr( $existing_class, $ws_at, $ws_length ); $class .= $name; } // Add new classes by appending those which haven't already been seen. foreach ( $this->classname_updates as $name => $operation ) { if ( self::ADD_CLASS === $operation ) { $modified = true; $class .= strlen( $class ) > 0 ? ' ' : ''; $class .= $name; } } $this->classname_updates = array(); if ( ! $modified ) { return; } if ( strlen( $class ) > 0 ) { $this->set_attribute( 'class', $class ); } else { $this->remove_attribute( 'class' ); } } /** * Applies attribute updates to HTML document. * * @since 6.2.0 * @since 6.2.1 Accumulates shift for internal cursor and passed pointer. * @since 6.3.0 Invalidate any bookmarks whose targets are overwritten. * * @param int $shift_this_point Accumulate and return shift for this position. * @return int How many bytes the given pointer moved in response to the updates. */ private function apply_attributes_updates( $shift_this_point = 0 ) { if ( ! count( $this->lexical_updates ) ) { return 0; } $accumulated_shift_for_given_point = 0; /* * Attribute updates can be enqueued in any order but updates * to the document must occur in lexical order; that is, each * replacement must be made before all others which follow it * at later string indices in the input document. * * Sorting avoid making out-of-order replacements which * can lead to mangled output, partially-duplicated * attributes, and overwritten attributes. */ usort( $this->lexical_updates, array( self::class, 'sort_start_ascending' ) ); $bytes_already_copied = 0; $output_buffer = ''; foreach ( $this->lexical_updates as $diff ) { $shift = strlen( $diff->text ) - $diff->length; // Adjust the cursor position by however much an update affects it. if ( $diff->start < $this->bytes_already_parsed ) { $this->bytes_already_parsed += $shift; } // Accumulate shift of the given pointer within this function call. if ( $diff->start <= $shift_this_point ) { $accumulated_shift_for_given_point += $shift; } $output_buffer .= substr( $this->html, $bytes_already_copied, $diff->start - $bytes_already_copied ); $output_buffer .= $diff->text; $bytes_already_copied = $diff->start + $diff->length; } $this->html = $output_buffer . substr( $this->html, $bytes_already_copied ); /* * Adjust bookmark locations to account for how the text * replacements adjust offsets in the input document. */ foreach ( $this->bookmarks as $bookmark_name => $bookmark ) { $bookmark_end = $bookmark->start + $bookmark->length; /* * Each lexical update which appears before the bookmark's endpoints * might shift the offsets for those endpoints. Loop through each change * and accumulate the total shift for each bookmark, then apply that * shift after tallying the full delta. */ $head_delta = 0; $tail_delta = 0; foreach ( $this->lexical_updates as $diff ) { $diff_end = $diff->start + $diff->length; if ( $bookmark->start < $diff->start && $bookmark_end < $diff->start ) { break; } if ( $bookmark->start >= $diff->start && $bookmark_end < $diff_end ) { $this->release_bookmark( $bookmark_name ); continue 2; } $delta = strlen( $diff->text ) - $diff->length; if ( $bookmark->start >= $diff->start ) { $head_delta += $delta; } if ( $bookmark_end >= $diff_end ) { $tail_delta += $delta; } } $bookmark->start += $head_delta; $bookmark->length += $tail_delta - $head_delta; } $this->lexical_updates = array(); return $accumulated_shift_for_given_point; } /** * Checks whether a bookmark with the given name exists. * * @since 6.3.0 * * @param string $bookmark_name Name to identify a bookmark that potentially exists. * @return bool Whether that bookmark exists. */ public function has_bookmark( $bookmark_name ) { return array_key_exists( $bookmark_name, $this->bookmarks ); } /** * Move the internal cursor in the Tag Processor to a given bookmark's location. * * In order to prevent accidental infinite loops, there's a * maximum limit on the number of times seek() can be called. * * @since 6.2.0 * * @param string $bookmark_name Jump to the place in the document identified by this bookmark name. * @return bool Whether the internal cursor was successfully moved to the bookmark's location. */ public function seek( $bookmark_name ) { if ( ! array_key_exists( $bookmark_name, $this->bookmarks ) ) { _doing_it_wrong( __METHOD__, __( 'Unknown bookmark name.' ), '6.2.0' ); return false; } if ( ++$this->seek_count > static::MAX_SEEK_OPS ) { _doing_it_wrong( __METHOD__, __( 'Too many calls to seek() - this can lead to performance issues.' ), '6.2.0' ); return false; } // Flush out any pending updates to the document. $this->get_updated_html(); // Point this tag processor before the sought tag opener and consume it. $this->bytes_already_parsed = $this->bookmarks[ $bookmark_name ]->start; $this->parser_state = self::STATE_READY; return $this->next_token(); } /** * Compare two WP_HTML_Text_Replacement objects. * * @since 6.2.0 * * @param WP_HTML_Text_Replacement $a First attribute update. * @param WP_HTML_Text_Replacement $b Second attribute update. * @return int Comparison value for string order. */ private static function sort_start_ascending( $a, $b ) { $by_start = $a->start - $b->start; if ( 0 !== $by_start ) { return $by_start; } $by_text = isset( $a->text, $b->text ) ? strcmp( $a->text, $b->text ) : 0; if ( 0 !== $by_text ) { return $by_text; } /* * This code should be unreachable, because it implies the two replacements * start at the same location and contain the same text. */ return $a->length - $b->length; } /** * Return the enqueued value for a given attribute, if one exists. * * Enqueued updates can take different data types: * - If an update is enqueued and is boolean, the return will be `true` * - If an update is otherwise enqueued, the return will be the string value of that update. * - If an attribute is enqueued to be removed, the return will be `null` to indicate that. * - If no updates are enqueued, the return will be `false` to differentiate from "removed." * * @since 6.2.0 * * @param string $comparable_name The attribute name in its comparable form. * @return string|boolean|null Value of enqueued update if present, otherwise false. */ private function get_enqueued_attribute_value( $comparable_name ) { if ( self::STATE_MATCHED_TAG !== $this->parser_state ) { return false; } if ( ! isset( $this->lexical_updates[ $comparable_name ] ) ) { return false; } $enqueued_text = $this->lexical_updates[ $comparable_name ]->text; // Removed attributes erase the entire span. if ( '' === $enqueued_text ) { return null; } /* * Boolean attribute updates are just the attribute name without a corresponding value. * * This value might differ from the given comparable name in that there could be leading * or trailing whitespace, and that the casing follows the name given in `set_attribute`. * * Example: * * $p->set_attribute( 'data-TEST-id', 'update' ); * 'update' === $p->get_enqueued_attribute_value( 'data-test-id' ); * * Detect this difference based on the absence of the `=`, which _must_ exist in any * attribute containing a value, e.g. ``. * ¹ ² * 1. Attribute with a string value. * 2. Boolean attribute whose value is `true`. */ $equals_at = strpos( $enqueued_text, '=' ); if ( false === $equals_at ) { return true; } /* * Finally, a normal update's value will appear after the `=` and * be double-quoted, as performed incidentally by `set_attribute`. * * e.g. `type="text"` * ¹² ³ * 1. Equals is here. * 2. Double-quoting starts one after the equals sign. * 3. Double-quoting ends at the last character in the update. */ $enqueued_value = substr( $enqueued_text, $equals_at + 2, -1 ); return html_entity_decode( $enqueued_value ); } /** * Returns the value of a requested attribute from a matched tag opener if that attribute exists. * * Example: * * $p = new WP_HTML_Tag_Processor( '
Test
' ); * $p->next_tag( array( 'class_name' => 'test' ) ) === true; * $p->get_attribute( 'data-test-id' ) === '14'; * $p->get_attribute( 'enabled' ) === true; * $p->get_attribute( 'aria-label' ) === null; * * $p->next_tag() === false; * $p->get_attribute( 'class' ) === null; * * @since 6.2.0 * * @param string $name Name of attribute whose value is requested. * @return string|true|null Value of attribute or `null` if not available. Boolean attributes return `true`. */ public function get_attribute( $name ) { if ( self::STATE_MATCHED_TAG !== $this->parser_state ) { return null; } $comparable = strtolower( $name ); /* * For every attribute other than `class` it's possible to perform a quick check if * there's an enqueued lexical update whose value takes priority over what's found in * the input document. * * The `class` attribute is special though because of the exposed helpers `add_class` * and `remove_class`. These form a builder for the `class` attribute, so an additional * check for enqueued class changes is required in addition to the check for any enqueued * attribute values. If any exist, those enqueued class changes must first be flushed out * into an attribute value update. */ if ( 'class' === $name ) { $this->class_name_updates_to_attributes_updates(); } // Return any enqueued attribute value updates if they exist. $enqueued_value = $this->get_enqueued_attribute_value( $comparable ); if ( false !== $enqueued_value ) { return $enqueued_value; } if ( ! isset( $this->attributes[ $comparable ] ) ) { return null; } $attribute = $this->attributes[ $comparable ]; /* * This flag distinguishes an attribute with no value * from an attribute with an empty string value. For * unquoted attributes this could look very similar. * It refers to whether an `=` follows the name. * * e.g.
* ¹ ² * 1. Attribute `boolean-attribute` is `true`. * 2. Attribute `empty-attribute` is `""`. */ if ( true === $attribute->is_true ) { return true; } $raw_value = substr( $this->html, $attribute->value_starts_at, $attribute->value_length ); return html_entity_decode( $raw_value ); } /** * Gets lowercase names of all attributes matching a given prefix in the current tag. * * Note that matching is case-insensitive. This is in accordance with the spec: * * > There must never be two or more attributes on * > the same start tag whose names are an ASCII * > case-insensitive match for each other. * - HTML 5 spec * * Example: * * $p = new WP_HTML_Tag_Processor( '
Test
' ); * $p->next_tag( array( 'class_name' => 'test' ) ) === true; * $p->get_attribute_names_with_prefix( 'data-' ) === array( 'data-enabled', 'data-test-id' ); * * $p->next_tag() === false; * $p->get_attribute_names_with_prefix( 'data-' ) === null; * * @since 6.2.0 * * @see https://html.spec.whatwg.org/multipage/syntax.html#attributes-2:ascii-case-insensitive * * @param string $prefix Prefix of requested attribute names. * @return array|null List of attribute names, or `null` when no tag opener is matched. */ public function get_attribute_names_with_prefix( $prefix ) { if ( self::STATE_MATCHED_TAG !== $this->parser_state || $this->is_closing_tag ) { return null; } $comparable = strtolower( $prefix ); $matches = array(); foreach ( array_keys( $this->attributes ) as $attr_name ) { if ( str_starts_with( $attr_name, $comparable ) ) { $matches[] = $attr_name; } } return $matches; } /** * Returns the uppercase name of the matched tag. * * Example: * * $p = new WP_HTML_Tag_Processor( '
Test
' ); * $p->next_tag() === true; * $p->get_tag() === 'DIV'; * * $p->next_tag() === false; * $p->get_tag() === null; * * @since 6.2.0 * * @return string|null Name of currently matched tag in input HTML, or `null` if none found. */ public function get_tag() { if ( null === $this->tag_name_starts_at ) { return null; } $tag_name = substr( $this->html, $this->tag_name_starts_at, $this->tag_name_length ); if ( self::STATE_MATCHED_TAG === $this->parser_state ) { return strtoupper( $tag_name ); } if ( self::STATE_COMMENT === $this->parser_state && self::COMMENT_AS_PI_NODE_LOOKALIKE === $this->get_comment_type() ) { return $tag_name; } return null; } /** * Indicates if the currently matched tag contains the self-closing flag. * * No HTML elements ought to have the self-closing flag and for those, the self-closing * flag will be ignored. For void elements this is benign because they "self close" * automatically. For non-void HTML elements though problems will appear if someone * intends to use a self-closing element in place of that element with an empty body. * For HTML foreign elements and custom elements the self-closing flag determines if * they self-close or not. * * This function does not determine if a tag is self-closing, * but only if the self-closing flag is present in the syntax. * * @since 6.3.0 * * @return bool Whether the currently matched tag contains the self-closing flag. */ public function has_self_closing_flag() { if ( self::STATE_MATCHED_TAG !== $this->parser_state ) { return false; } /* * The self-closing flag is the solidus at the _end_ of the tag, not the beginning. * * Example: * *
* ^ this appears one character before the end of the closing ">". */ return '/' === $this->html[ $this->token_starts_at + $this->token_length - 1 ]; } /** * Indicates if the current tag token is a tag closer. * * Example: * * $p = new WP_HTML_Tag_Processor( '
' ); * $p->next_tag( array( 'tag_name' => 'div', 'tag_closers' => 'visit' ) ); * $p->is_tag_closer() === false; * * $p->next_tag( array( 'tag_name' => 'div', 'tag_closers' => 'visit' ) ); * $p->is_tag_closer() === true; * * @since 6.2.0 * * @return bool Whether the current tag is a tag closer. */ public function is_tag_closer() { return ( self::STATE_MATCHED_TAG === $this->parser_state && $this->is_closing_tag ); } /** * Indicates the kind of matched token, if any. * * This differs from `get_token_name()` in that it always * returns a static string indicating the type, whereas * `get_token_name()` may return values derived from the * token itself, such as a tag name or processing * instruction tag. * * Possible values: * - `#tag` when matched on a tag. * - `#text` when matched on a text node. * - `#cdata-section` when matched on a CDATA node. * - `#comment` when matched on a comment. * - `#doctype` when matched on a DOCTYPE declaration. * - `#presumptuous-tag` when matched on an empty tag closer. * - `#funky-comment` when matched on a funky comment. * * @since 6.5.0 * * @return string|null What kind of token is matched, or null. */ public function get_token_type() { switch ( $this->parser_state ) { case self::STATE_MATCHED_TAG: return '#tag'; case self::STATE_DOCTYPE: return '#doctype'; default: return $this->get_token_name(); } } /** * Returns the node name represented by the token. * * This matches the DOM API value `nodeName`. Some values * are static, such as `#text` for a text node, while others * are dynamically generated from the token itself. * * Dynamic names: * - Uppercase tag name for tag matches. * - `html` for DOCTYPE declarations. * * Note that if the Tag Processor is not matched on a token * then this function will return `null`, either because it * hasn't yet found a token or because it reached the end * of the document without matching a token. * * @since 6.5.0 * * @return string|null Name of the matched token. */ public function get_token_name() { switch ( $this->parser_state ) { case self::STATE_MATCHED_TAG: return $this->get_tag(); case self::STATE_TEXT_NODE: return '#text'; case self::STATE_CDATA_NODE: return '#cdata-section'; case self::STATE_COMMENT: return '#comment'; case self::STATE_DOCTYPE: return 'html'; case self::STATE_PRESUMPTUOUS_TAG: return '#presumptuous-tag'; case self::STATE_FUNKY_COMMENT: return '#funky-comment'; } } /** * Indicates what kind of comment produced the comment node. * * Because there are different kinds of HTML syntax which produce * comments, the Tag Processor tracks and exposes this as a type * for the comment. Nominally only regular HTML comments exist as * they are commonly known, but a number of unrelated syntax errors * also produce comments. * * @see self::COMMENT_AS_ABRUPTLY_CLOSED_COMMENT * @see self::COMMENT_AS_CDATA_LOOKALIKE * @see self::COMMENT_AS_INVALID_HTML * @see self::COMMENT_AS_HTML_COMMENT * @see self::COMMENT_AS_PI_NODE_LOOKALIKE * * @since 6.5.0 * * @return string|null */ public function get_comment_type() { if ( self::STATE_COMMENT !== $this->parser_state ) { return null; } return $this->comment_type; } /** * Returns the modifiable text for a matched token, or an empty string. * * Modifiable text is text content that may be read and changed without * changing the HTML structure of the document around it. This includes * the contents of `#text` nodes in the HTML as well as the inner * contents of HTML comments, Processing Instructions, and others, even * though these nodes aren't part of a parsed DOM tree. They also contain * the contents of SCRIPT and STYLE tags, of TEXTAREA tags, and of any * other section in an HTML document which cannot contain HTML markup (DATA). * * If a token has no modifiable text then an empty string is returned to * avoid needless crashing or type errors. An empty string does not mean * that a token has modifiable text, and a token with modifiable text may * have an empty string (e.g. a comment with no contents). * * @since 6.5.0 * * @return string */ public function get_modifiable_text() { if ( null === $this->text_starts_at ) { return ''; } $text = substr( $this->html, $this->text_starts_at, $this->text_length ); // Comment data is not decoded. if ( self::STATE_CDATA_NODE === $this->parser_state || self::STATE_COMMENT === $this->parser_state || self::STATE_DOCTYPE === $this->parser_state || self::STATE_FUNKY_COMMENT === $this->parser_state ) { return $text; } $tag_name = $this->get_tag(); if ( // Script data is not decoded. 'SCRIPT' === $tag_name || // RAWTEXT data is not decoded. 'IFRAME' === $tag_name || 'NOEMBED' === $tag_name || 'NOFRAMES' === $tag_name || 'STYLE' === $tag_name || 'XMP' === $tag_name ) { return $text; } $decoded = html_entity_decode( $text, ENT_QUOTES | ENT_HTML5 | ENT_SUBSTITUTE ); /* * TEXTAREA skips a leading newline, but this newline may appear not only as the * literal character `\n`, but also as a character reference, such as in the * following markup: ``. * * For these cases it's important to first decode the text content before checking * for a leading newline and removing it. */ if ( self::STATE_MATCHED_TAG === $this->parser_state && 'TEXTAREA' === $tag_name && strlen( $decoded ) > 0 && "\n" === $decoded[0] ) { return substr( $decoded, 1 ); } return $decoded; } /** * Updates or creates a new attribute on the currently matched tag with the passed value. * * For boolean attributes special handling is provided: * - When `true` is passed as the value, then only the attribute name is added to the tag. * - When `false` is passed, the attribute gets removed if it existed before. * * For string attributes, the value is escaped using the `esc_attr` function. * * @since 6.2.0 * @since 6.2.1 Fix: Only create a single update for multiple calls with case-variant attribute names. * * @param string $name The attribute name to target. * @param string|bool $value The new attribute value. * @return bool Whether an attribute value was set. */ public function set_attribute( $name, $value ) { if ( self::STATE_MATCHED_TAG !== $this->parser_state || $this->is_closing_tag ) { return false; } /* * WordPress rejects more characters than are strictly forbidden * in HTML5. This is to prevent additional security risks deeper * in the WordPress and plugin stack. Specifically the * less-than (<) greater-than (>) and ampersand (&) aren't allowed. * * The use of a PCRE match enables looking for specific Unicode * code points without writing a UTF-8 decoder. Whereas scanning * for one-byte characters is trivial (with `strcspn`), scanning * for the longer byte sequences would be more complicated. Given * that this shouldn't be in the hot path for execution, it's a * reasonable compromise in efficiency without introducing a * noticeable impact on the overall system. * * @see https://html.spec.whatwg.org/#attributes-2 * * @todo As the only regex pattern maybe we should take it out? * Are Unicode patterns available broadly in Core? */ if ( preg_match( '~[' . // Syntax-like characters. '"\'>& The values "true" and "false" are not allowed on boolean attributes. * > To represent a false value, the attribute has to be omitted altogether. * - HTML5 spec, https://html.spec.whatwg.org/#boolean-attributes */ if ( false === $value ) { return $this->remove_attribute( $name ); } if ( true === $value ) { $updated_attribute = $name; } else { $escaped_new_value = esc_attr( $value ); $updated_attribute = "{$name}=\"{$escaped_new_value}\""; } /* * > There must never be two or more attributes on * > the same start tag whose names are an ASCII * > case-insensitive match for each other. * - HTML 5 spec * * @see https://html.spec.whatwg.org/multipage/syntax.html#attributes-2:ascii-case-insensitive */ $comparable_name = strtolower( $name ); if ( isset( $this->attributes[ $comparable_name ] ) ) { /* * Update an existing attribute. * * Example – set attribute id to "new" in
: * *
* ^-------------^ * start end * replacement: `id="new"` * * Result:
*/ $existing_attribute = $this->attributes[ $comparable_name ]; $this->lexical_updates[ $comparable_name ] = new WP_HTML_Text_Replacement( $existing_attribute->start, $existing_attribute->length, $updated_attribute ); } else { /* * Create a new attribute at the tag's name end. * * Example – add attribute id="new" to
: * *
* ^ * start and end * replacement: ` id="new"` * * Result:
*/ $this->lexical_updates[ $comparable_name ] = new WP_HTML_Text_Replacement( $this->tag_name_starts_at + $this->tag_name_length, 0, ' ' . $updated_attribute ); } /* * Any calls to update the `class` attribute directly should wipe out any * enqueued class changes from `add_class` and `remove_class`. */ if ( 'class' === $comparable_name && ! empty( $this->classname_updates ) ) { $this->classname_updates = array(); } return true; } /** * Remove an attribute from the currently-matched tag. * * @since 6.2.0 * * @param string $name The attribute name to remove. * @return bool Whether an attribute was removed. */ public function remove_attribute( $name ) { if ( self::STATE_MATCHED_TAG !== $this->parser_state || $this->is_closing_tag ) { return false; } /* * > There must never be two or more attributes on * > the same start tag whose names are an ASCII * > case-insensitive match for each other. * - HTML 5 spec * * @see https://html.spec.whatwg.org/multipage/syntax.html#attributes-2:ascii-case-insensitive */ $name = strtolower( $name ); /* * Any calls to update the `class` attribute directly should wipe out any * enqueued class changes from `add_class` and `remove_class`. */ if ( 'class' === $name && count( $this->classname_updates ) !== 0 ) { $this->classname_updates = array(); } /* * If updating an attribute that didn't exist in the input * document, then remove the enqueued update and move on. * * For example, this might occur when calling `remove_attribute()` * after calling `set_attribute()` for the same attribute * and when that attribute wasn't originally present. */ if ( ! isset( $this->attributes[ $name ] ) ) { if ( isset( $this->lexical_updates[ $name ] ) ) { unset( $this->lexical_updates[ $name ] ); } return false; } /* * Removes an existing tag attribute. * * Example – remove the attribute id from
: *
* ^-------------^ * start end * replacement: `` * * Result:
*/ $this->lexical_updates[ $name ] = new WP_HTML_Text_Replacement( $this->attributes[ $name ]->start, $this->attributes[ $name ]->length, '' ); // Removes any duplicated attributes if they were also present. if ( null !== $this->duplicate_attributes && array_key_exists( $name, $this->duplicate_attributes ) ) { foreach ( $this->duplicate_attributes[ $name ] as $attribute_token ) { $this->lexical_updates[] = new WP_HTML_Text_Replacement( $attribute_token->start, $attribute_token->length, '' ); } } return true; } /** * Adds a new class name to the currently matched tag. * * @since 6.2.0 * * @param string $class_name The class name to add. * @return bool Whether the class was set to be added. */ public function add_class( $class_name ) { if ( self::STATE_MATCHED_TAG !== $this->parser_state || $this->is_closing_tag ) { return false; } $this->classname_updates[ $class_name ] = self::ADD_CLASS; return true; } /** * Removes a class name from the currently matched tag. * * @since 6.2.0 * * @param string $class_name The class name to remove. * @return bool Whether the class was set to be removed. */ public function remove_class( $class_name ) { if ( self::STATE_MATCHED_TAG !== $this->parser_state || $this->is_closing_tag ) { return false; } if ( null !== $this->tag_name_starts_at ) { $this->classname_updates[ $class_name ] = self::REMOVE_CLASS; } return true; } /** * Returns the string representation of the HTML Tag Processor. * * @since 6.2.0 * * @see WP_HTML_Tag_Processor::get_updated_html() * * @return string The processed HTML. */ public function __toString() { return $this->get_updated_html(); } /** * Returns the string representation of the HTML Tag Processor. * * @since 6.2.0 * @since 6.2.1 Shifts the internal cursor corresponding to the applied updates. * @since 6.4.0 No longer calls subclass method `next_tag()` after updating HTML. * * @return string The processed HTML. */ public function get_updated_html() { $requires_no_updating = 0 === count( $this->classname_updates ) && 0 === count( $this->lexical_updates ); /* * When there is nothing more to update and nothing has already been * updated, return the original document and avoid a string copy. */ if ( $requires_no_updating ) { return $this->html; } /* * Keep track of the position right before the current tag. This will * be necessary for reparsing the current tag after updating the HTML. */ $before_current_tag = $this->token_starts_at; /* * 1. Apply the enqueued edits and update all the pointers to reflect those changes. */ $this->class_name_updates_to_attributes_updates(); $before_current_tag += $this->apply_attributes_updates( $before_current_tag ); /* * 2. Rewind to before the current tag and reparse to get updated attributes. * * At this point the internal cursor points to the end of the tag name. * Rewind before the tag name starts so that it's as if the cursor didn't * move; a call to `next_tag()` will reparse the recently-updated attributes * and additional calls to modify the attributes will apply at this same * location, but in order to avoid issues with subclasses that might add * behaviors to `next_tag()`, the internal methods should be called here * instead. * * It's important to note that in this specific place there will be no change * because the processor was already at a tag when this was called and it's * rewinding only to the beginning of this very tag before reprocessing it * and its attributes. * *

Previous HTMLMore HTML

* ↑ │ back up by the length of the tag name plus the opening < * └←─┘ back up by strlen("em") + 1 ==> 3 */ $this->bytes_already_parsed = $before_current_tag; $this->base_class_next_token(); return $this->html; } /** * Parses tag query input into internal search criteria. * * @since 6.2.0 * * @param array|string|null $query { * Optional. Which tag name to find, having which class, etc. Default is to find any tag. * * @type string|null $tag_name Which tag to find, or `null` for "any tag." * @type int|null $match_offset Find the Nth tag matching all search criteria. * 1 for "first" tag, 3 for "third," etc. * Defaults to first tag. * @type string|null $class_name Tag must contain this class name to match. * @type string $tag_closers "visit" or "skip": whether to stop on tag closers, e.g.
. * } */ private function parse_query( $query ) { if ( null !== $query && $query === $this->last_query ) { return; } $this->last_query = $query; $this->sought_tag_name = null; $this->sought_class_name = null; $this->sought_match_offset = 1; $this->stop_on_tag_closers = false; // A single string value means "find the tag of this name". if ( is_string( $query ) ) { $this->sought_tag_name = $query; return; } // An empty query parameter applies no restrictions on the search. if ( null === $query ) { return; } // If not using the string interface, an associative array is required. if ( ! is_array( $query ) ) { _doing_it_wrong( __METHOD__, __( 'The query argument must be an array or a tag name.' ), '6.2.0' ); return; } if ( isset( $query['tag_name'] ) && is_string( $query['tag_name'] ) ) { $this->sought_tag_name = $query['tag_name']; } if ( isset( $query['class_name'] ) && is_string( $query['class_name'] ) ) { $this->sought_class_name = $query['class_name']; } if ( isset( $query['match_offset'] ) && is_int( $query['match_offset'] ) && 0 < $query['match_offset'] ) { $this->sought_match_offset = $query['match_offset']; } if ( isset( $query['tag_closers'] ) ) { $this->stop_on_tag_closers = 'visit' === $query['tag_closers']; } } /** * Checks whether a given tag and its attributes match the search criteria. * * @since 6.2.0 * * @return bool Whether the given tag and its attribute match the search criteria. */ private function matches() { if ( $this->is_closing_tag && ! $this->stop_on_tag_closers ) { return false; } // Does the tag name match the requested tag name in a case-insensitive manner? if ( null !== $this->sought_tag_name ) { /* * String (byte) length lookup is fast. If they aren't the * same length then they can't be the same string values. */ if ( strlen( $this->sought_tag_name ) !== $this->tag_name_length ) { return false; } /* * Check each character to determine if they are the same. * Defer calls to `strtoupper()` to avoid them when possible. * Calling `strcasecmp()` here tested slowed than comparing each * character, so unless benchmarks show otherwise, it should * not be used. * * It's expected that most of the time that this runs, a * lower-case tag name will be supplied and the input will * contain lower-case tag names, thus normally bypassing * the case comparison code. */ for ( $i = 0; $i < $this->tag_name_length; $i++ ) { $html_char = $this->html[ $this->tag_name_starts_at + $i ]; $tag_char = $this->sought_tag_name[ $i ]; if ( $html_char !== $tag_char && strtoupper( $html_char ) !== $tag_char ) { return false; } } } if ( null !== $this->sought_class_name && ! $this->has_class( $this->sought_class_name ) ) { return false; } return true; } /** * Parser Ready State. * * Indicates that the parser is ready to run and waiting for a state transition. * It may not have started yet, or it may have just finished parsing a token and * is ready to find the next one. * * @since 6.5.0 * * @access private */ const STATE_READY = 'STATE_READY'; /** * Parser Complete State. * * Indicates that the parser has reached the end of the document and there is * nothing left to scan. It finished parsing the last token completely. * * @since 6.5.0 * * @access private */ const STATE_COMPLETE = 'STATE_COMPLETE'; /** * Parser Incomplete Input State. * * Indicates that the parser has reached the end of the document before finishing * a token. It started parsing a token but there is a possibility that the input * HTML document was truncated in the middle of a token. * * The parser is reset at the start of the incomplete token and has paused. There * is nothing more than can be scanned unless provided a more complete document. * * @since 6.5.0 * * @access private */ const STATE_INCOMPLETE_INPUT = 'STATE_INCOMPLETE_INPUT'; /** * Parser Matched Tag State. * * Indicates that the parser has found an HTML tag and it's possible to get * the tag name and read or modify its attributes (if it's not a closing tag). * * @since 6.5.0 * * @access private */ const STATE_MATCHED_TAG = 'STATE_MATCHED_TAG'; /** * Parser Text Node State. * * Indicates that the parser has found a text node and it's possible * to read and modify that text. * * @since 6.5.0 * * @access private */ const STATE_TEXT_NODE = 'STATE_TEXT_NODE'; /** * Parser CDATA Node State. * * Indicates that the parser has found a CDATA node and it's possible * to read and modify its modifiable text. Note that in HTML there are * no CDATA nodes outside of foreign content (SVG and MathML). Outside * of foreign content, they are treated as HTML comments. * * @since 6.5.0 * * @access private */ const STATE_CDATA_NODE = 'STATE_CDATA_NODE'; /** * Indicates that the parser has found an HTML comment and it's * possible to read and modify its modifiable text. * * @since 6.5.0 * * @access private */ const STATE_COMMENT = 'STATE_COMMENT'; /** * Indicates that the parser has found a DOCTYPE node and it's * possible to read and modify its modifiable text. * * @since 6.5.0 * * @access private */ const STATE_DOCTYPE = 'STATE_DOCTYPE'; /** * Indicates that the parser has found an empty tag closer ``. * * Note that in HTML there are no empty tag closers, and they * are ignored. Nonetheless, the Tag Processor still * recognizes them as they appear in the HTML stream. * * These were historically discussed as a "presumptuous tag * closer," which would close the nearest open tag, but were * dismissed in favor of explicitly-closing tags. * * @since 6.5.0 * * @access private */ const STATE_PRESUMPTUOUS_TAG = 'STATE_PRESUMPTUOUS_TAG'; /** * Indicates that the parser has found a "funky comment" * and it's possible to read and modify its modifiable text. * * Example: * * * * * * Funky comments are tag closers with invalid tag names. Note * that in HTML these are turn into bogus comments. Nonetheless, * the Tag Processor recognizes them in a stream of HTML and * exposes them for inspection and modification. * * @since 6.5.0 * * @access private */ const STATE_FUNKY_COMMENT = 'STATE_WP_FUNKY'; /** * Indicates that a comment was created when encountering abruptly-closed HTML comment. * * Example: * * * * * @since 6.5.0 */ const COMMENT_AS_ABRUPTLY_CLOSED_COMMENT = 'COMMENT_AS_ABRUPTLY_CLOSED_COMMENT'; /** * Indicates that a comment would be parsed as a CDATA node, * were HTML to allow CDATA nodes outside of foreign content. * * Example: * * * * This is an HTML comment, but it looks like a CDATA node. * * @since 6.5.0 */ const COMMENT_AS_CDATA_LOOKALIKE = 'COMMENT_AS_CDATA_LOOKALIKE'; /** * Indicates that a comment was created when encountering * normative HTML comment syntax. * * Example: * * * * @since 6.5.0 */ const COMMENT_AS_HTML_COMMENT = 'COMMENT_AS_HTML_COMMENT'; /** * Indicates that a comment would be parsed as a Processing * Instruction node, were they to exist within HTML. * * Example: * * * * This is an HTML comment, but it looks like a CDATA node. * * @since 6.5.0 */ const COMMENT_AS_PI_NODE_LOOKALIKE = 'COMMENT_AS_PI_NODE_LOOKALIKE'; /** * Indicates that a comment was created when encountering invalid * HTML input, a so-called "bogus comment." * * Example: * * * * * @since 6.5.0 */ const COMMENT_AS_INVALID_HTML = 'COMMENT_AS_INVALID_HTML'; }