Changeset 57348
- Timestamp:
- 01/24/2024 11:35:46 PM (6 months ago)
- Location:
- trunk
- Files:
-
- 1 added
- 4 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/src/wp-includes/html-api/class-wp-html-processor.php
r57343 r57348 151 151 152 152 /** 153 * Static query for instructing the Tag Processor to visit every token.154 *155 * @access private156 *157 * @since 6.4.0158 *159 * @var array160 */161 const VISIT_EVERYTHING = array( 'tag_closers' => 'visit' );162 163 /**164 153 * Holds the working state of the parser, including the stack of 165 154 * open elements and the stack of active formatting elements. … … 423 412 424 413 return false; 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 425 438 } 426 439 … … 521 534 } 522 535 523 parent::next_tag( self::VISIT_EVERYTHING ); 536 while ( parent::next_token() && '#tag' !== $this->get_token_type() ) { 537 continue; 538 } 524 539 } 525 540 -
trunk/src/wp-includes/html-api/class-wp-html-tag-processor.php
r57227 r57348 248 248 * } 249 249 * 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 250 339 * ## Design and limitations 251 340 * … … 321 410 * @since 6.3.2 Fix: Skip HTML-like content inside rawtext elements such as STYLE. 322 411 * @since 6.5.0 Pauses processor when input ends in an incomplete syntax token. 323 * Introduces "special" elements which act like void elements, e.g. STYLE. 412 * Introduces "special" elements which act like void elements, e.g. TITLE, STYLE. 413 * Allows scanning through all tokens and processing modifiable text, where applicable. 324 414 */ 325 415 class WP_HTML_Tag_Processor { … … 397 487 * Specifies mode of operation of the parser at any given time. 398 488 * 399 * | State | Meaning | 400 * | --------------|----------------------------------------------------------------------| 401 * | *Ready* | The parser is ready to run. | 402 * | *Complete* | There is nothing left to parse. | 403 * | *Incomplete* | The HTML ended in the middle of a token; nothing more can be parsed. | 404 * | *Matched tag* | Found an HTML tag; it's possible to modify its attributes. | 489 * | State | Meaning | 490 * | ----------------|----------------------------------------------------------------------| 491 * | *Ready* | The parser is ready to run. | 492 * | *Complete* | There is nothing left to parse. | 493 * | *Incomplete* | The HTML ended in the middle of a token; nothing more can be parsed. | 494 * | *Matched tag* | Found an HTML tag; it's possible to modify its attributes. | 495 * | *Text node* | Found a #text node; this is plaintext and modifiable. | 496 * | *CDATA node* | Found a CDATA section; this is modifiable. | 497 * | *Comment* | Found a comment or bogus comment; this is modifiable. | 498 * | *Presumptuous* | Found an empty tag closer: `</>`. | 499 * | *Funky comment* | Found a tag closer with an invalid tag name; this is modifiable. | 405 500 * 406 501 * @since 6.5.0 … … 408 503 * @see WP_HTML_Tag_Processor::STATE_READY 409 504 * @see WP_HTML_Tag_Processor::STATE_COMPLETE 410 * @see WP_HTML_Tag_Processor::STATE_INCOMPLETE 505 * @see WP_HTML_Tag_Processor::STATE_INCOMPLETE 411 506 * @see WP_HTML_Tag_Processor::STATE_MATCHED_TAG 507 508 509 510 511 512 412 513 * 413 514 * @var string 414 515 */ 415 private $parser_state = self::STATE_READY; 516 protected $parser_state = self::STATE_READY; 517 518 /** 519 * What kind of syntax token became an HTML comment. 520 * 521 * Since there are many ways in which HTML syntax can create an HTML comment, 522 * this indicates which of those caused it. This allows the Tag Processor to 523 * represent more from the original input document than would appear in the DOM. 524 * 525 * @since 6.5.0 526 * 527 * @var string|null 528 */ 529 protected $comment_type = null; 416 530 417 531 /** … … 490 604 */ 491 605 private $tag_name_length; 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 492 624 493 625 /** … … 706 838 */ 707 839 public function next_token() { 840 708 841 $this->get_updated_html(); 709 $was_at = $this->bytes_already_parsed;710 842 711 843 // Don't proceed if there's nothing more to scan. 712 844 if ( 713 845 self::STATE_COMPLETE === $this->parser_state || 714 self::STATE_INCOMPLETE === $this->parser_state846 self::STATE_INCOMPLETE === $this->parser_state 715 847 ) { 716 848 return false; … … 730 862 // Find the next tag if it exists. 731 863 if ( false === $this->parse_next_tag() ) { 732 if ( self::STATE_INCOMPLETE === $this->parser_state ) {864 if ( self::STATE_INCOMPLETE === $this->parser_state ) { 733 865 $this->bytes_already_parsed = $was_at; 734 866 } 735 867 736 868 return false; 869 870 871 872 873 874 875 876 877 878 879 880 881 882 737 883 } 738 884 … … 744 890 // Ensure that the tag closes before the end of the document. 745 891 if ( 746 self::STATE_INCOMPLETE === $this->parser_state ||892 self::STATE_INCOMPLETE === $this->parser_state || 747 893 $this->bytes_already_parsed >= strlen( $this->html ) 748 894 ) { 749 895 // Does this appropriately clear state (parsed attributes)? 750 $this->parser_state = self::STATE_INCOMPLETE ;896 $this->parser_state = self::STATE_INCOMPLETE; 751 897 $this->bytes_already_parsed = $was_at; 752 898 … … 756 902 $tag_ends_at = strpos( $this->html, '>', $this->bytes_already_parsed ); 757 903 if ( false === $tag_ends_at ) { 758 $this->parser_state = self::STATE_INCOMPLETE ;904 $this->parser_state = self::STATE_INCOMPLETE; 759 905 $this->bytes_already_parsed = $was_at; 760 906 … … 763 909 $this->parser_state = self::STATE_MATCHED_TAG; 764 910 $this->token_length = $tag_ends_at - $this->token_starts_at; 765 $this->bytes_already_parsed = $tag_ends_at ;911 $this->bytes_already_parsed = $tag_ends_at; 766 912 767 913 /* … … 772 918 $t = $this->html[ $this->tag_name_starts_at ]; 773 919 if ( 774 ! $this->is_closing_tag &&775 (920 921 ( 776 922 'i' === $t || 'I' === $t || 777 923 'n' === $t || 'N' === $t || … … 781 927 ) 782 928 ) { 783 $tag_name = $this->get_tag(); 784 785 if ( 'SCRIPT' === $tag_name && ! $this->skip_script_data() ) { 786 $this->parser_state = self::STATE_INCOMPLETE; 787 $this->bytes_already_parsed = $was_at; 788 789 return false; 790 } elseif ( 791 ( 'TEXTAREA' === $tag_name || 'TITLE' === $tag_name ) && 792 ! $this->skip_rcdata( $tag_name ) 793 ) { 794 $this->parser_state = self::STATE_INCOMPLETE; 795 $this->bytes_already_parsed = $was_at; 796 797 return false; 798 } elseif ( 799 ( 800 'IFRAME' === $tag_name || 801 'NOEMBED' === $tag_name || 802 'NOFRAMES' === $tag_name || 803 'STYLE' === $tag_name || 804 'XMP' === $tag_name 805 ) && 806 ! $this->skip_rawtext( $tag_name ) 807 ) { 808 $this->parser_state = self::STATE_INCOMPLETE; 809 $this->bytes_already_parsed = $was_at; 810 811 return false; 812 } 813 } 929 return true; 930 } 931 932 $tag_name = $this->get_tag(); 933 934 /* 935 * Preserve the opening tag pointers, as these will be overwritten 936 * when finding the closing tag. They will be reset after finding 937 * the closing to tag to point to the opening of the special atomic 938 * tag sequence. 939 */ 940 $tag_name_starts_at = $this->tag_name_starts_at; 941 $tag_name_length = $this->tag_name_length; 942 $tag_ends_at = $this->token_starts_at + $this->token_length; 943 $attributes = $this->attributes; 944 $duplicate_attributes = $this->duplicate_attributes; 945 946 // Find the closing tag if necessary. 947 $found_closer = false; 948 switch ( $tag_name ) { 949 case 'SCRIPT': 950 $found_closer = $this->skip_script_data(); 951 break; 952 953 case 'TEXTAREA': 954 case 'TITLE': 955 $found_closer = $this->skip_rcdata( $tag_name ); 956 break; 957 958 /* 959 * In the browser this list would include the NOSCRIPT element, 960 * but the Tag Processor is an environment with the scripting 961 * flag disabled, meaning that it needs to descend into the 962 * NOSCRIPT element to be able to properly process what will be 963 * sent to a browser. 964 * 965 * Note that this rule makes HTML5 syntax incompatible with XML, 966 * because the parsing of this token depends on client application. 967 * The NOSCRIPT element cannot be represented in the XHTML syntax. 968 */ 969 case 'IFRAME': 970 case 'NOEMBED': 971 case 'NOFRAMES': 972 case 'STYLE': 973 case 'XMP': 974 $found_closer = $this->skip_rawtext( $tag_name ); 975 break; 976 977 // No other tags should be treated in their entirety here. 978 default: 979 return true; 980 } 981 982 if ( ! $found_closer ) { 983 $this->parser_state = self::STATE_INCOMPLETE_INPUT; 984 $this->bytes_already_parsed = $was_at; 985 return false; 986 } 987 988 /* 989 * The values here look like they reference the opening tag but they reference 990 * the closing tag instead. This is why the opening tag values were stored 991 * above in a variable. It reads confusingly here, but that's because the 992 * functions that skip the contents have moved all the internal cursors past 993 * the inner content of the tag. 994 */ 995 $this->token_starts_at = $was_at; 996 $this->token_length = $this->bytes_already_parsed - $this->token_starts_at; 997 $this->text_starts_at = $tag_ends_at + 1; 998 $this->text_length = $this->tag_name_starts_at - $this->text_starts_at; 999 $this->tag_name_starts_at = $tag_name_starts_at; 1000 $this->tag_name_length = $tag_name_length; 1001 $this->attributes = $attributes; 1002 $this->duplicate_attributes = $duplicate_attributes; 814 1003 815 1004 return true; … … 831 1020 */ 832 1021 public function paused_at_incomplete_token() { 833 return self::STATE_INCOMPLETE === $this->parser_state;1022 return self::STATE_INCOMPLETE === $this->parser_state; 834 1023 } 835 1024 … … 1008 1197 public function set_bookmark( $name ) { 1009 1198 // It only makes sense to set a bookmark if the parser has paused on a concrete token. 1010 if ( self::STATE_MATCHED_TAG !== $this->parser_state ) { 1199 if ( 1200 self::STATE_COMPLETE === $this->parser_state || 1201 self::STATE_INCOMPLETE_INPUT === $this->parser_state 1202 ) { 1011 1203 return false; 1012 1204 } … … 1083 1275 1084 1276 while ( false !== $at && $at < $doc_length ) { 1085 $at = strpos( $this->html, '</', $at ); 1277 $at = strpos( $this->html, '</', $at ); 1278 $this->tag_name_starts_at = $at; 1086 1279 1087 1280 // Fail if there is no possible tag closer. … … 1090 1283 } 1091 1284 1092 $closer_potentially_starts_at = $at; 1093 $at += 2; 1285 $at += 2; 1094 1286 1095 1287 /* … … 1132 1324 continue; 1133 1325 } 1326 1134 1327 $at = $this->bytes_already_parsed; 1135 1328 if ( $at >= strlen( $this->html ) ) { … … 1137 1330 } 1138 1331 1139 if ( '>' === $html[ $at ] || '/' === $html[ $at ] ) { 1140 $this->bytes_already_parsed = $closer_potentially_starts_at; 1332 if ( '>' === $html[ $at ] ) { 1333 $this->bytes_already_parsed = $at + 1; 1334 return true; 1335 } 1336 1337 if ( $at + 1 >= strlen( $this->html ) ) { 1338 return false; 1339 } 1340 1341 if ( '/' === $html[ $at ] && '>' === $html[ $at + 1 ] ) { 1342 $this->bytes_already_parsed = $at + 2; 1141 1343 return true; 1142 1344 } … … 1260 1462 if ( $is_closing ) { 1261 1463 $this->bytes_already_parsed = $closer_potentially_starts_at; 1464 1262 1465 if ( $this->bytes_already_parsed >= $doc_length ) { 1263 1466 return false; … … 1269 1472 1270 1473 if ( $this->bytes_already_parsed >= $doc_length ) { 1271 $this->parser_state = self::STATE_INCOMPLETE ;1474 $this->parser_state = self::STATE_INCOMPLETE; 1272 1475 1273 1476 return false; … … 1275 1478 1276 1479 if ( '>' === $html[ $this->bytes_already_parsed ] ) { 1277 $this->bytes_already_parsed = $closer_potentially_starts_at;1480 ; 1278 1481 return true; 1279 1482 } … … 1304 1507 $html = $this->html; 1305 1508 $doc_length = strlen( $html ); 1306 $at = $this->bytes_already_parsed; 1509 $was_at = $this->bytes_already_parsed; 1510 $at = $was_at; 1307 1511 1308 1512 while ( false !== $at && $at < $doc_length ) { 1309 1513 $at = strpos( $html, '<', $at ); 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1310 1524 1311 1525 /* … … 1314 1528 */ 1315 1529 if ( false === $at ) { 1316 return false; 1530 $this->parser_state = self::STATE_TEXT_NODE; 1531 $this->token_starts_at = $was_at; 1532 $this->token_length = strlen( $html ) - $was_at; 1533 $this->text_starts_at = $was_at; 1534 $this->text_length = $this->token_length; 1535 $this->bytes_already_parsed = strlen( $html ); 1536 return true; 1317 1537 } 1318 1538 … … 1343 1563 if ( $tag_name_prefix_length > 0 ) { 1344 1564 ++$at; 1565 1566 1345 1567 $this->tag_name_length = $tag_name_prefix_length + strcspn( $html, " \t\f\r\n/>", $at + $tag_name_prefix_length ); 1346 $this->tag_name_starts_at = $at;1347 1568 $this->bytes_already_parsed = $at + $this->tag_name_length; 1348 1569 return true; … … 1354 1575 */ 1355 1576 if ( $at + 1 >= $doc_length ) { 1356 $this->parser_state = self::STATE_INCOMPLETE ;1577 $this->parser_state = self::STATE_INCOMPLETE; 1357 1578 1358 1579 return false; … … 1360 1581 1361 1582 /* 1362 * <!transitions to markup declaration open state1583 * transitions to markup declaration open state 1363 1584 * https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state 1364 1585 */ 1365 1586 if ( '!' === $html[ $at + 1 ] ) { 1366 1587 /* 1367 * <!-- transitions to a bogus comment state – skip to the nearest -->1588 * 1368 1589 * https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state 1369 1590 */ … … 1376 1597 // If it's not possible to close the comment then there is nothing more to scan. 1377 1598 if ( $doc_length <= $closer_at ) { 1378 $this->parser_state = self::STATE_INCOMPLETE ;1599 $this->parser_state = self::STATE_INCOMPLETE; 1379 1600 1380 1601 return false; … … 1384 1605 $span_of_dashes = strspn( $html, '-', $closer_at ); 1385 1606 if ( '>' === $html[ $closer_at + $span_of_dashes ] ) { 1386 $at = $closer_at + $span_of_dashes + 1; 1387 continue; 1607 /* 1608 * @todo When implementing `set_modifiable_text()` ensure that updates to this token 1609 * don't break the syntax for short comments, e.g. `<!--->`. Unlike other comment 1610 * and bogus comment syntax, these leave no clear insertion point for text and 1611 * they need to be modified specially in order to contain text. E.g. to store 1612 * `?` as the modifiable text, the `<!--->` needs to become `<!--?-->`, which 1613 * involves inserting an additional `-` into the token after the modifiable text. 1614 */ 1615 $this->parser_state = self::STATE_COMMENT; 1616 $this->comment_type = self::COMMENT_AS_ABRUPTLY_CLOSED_COMMENT; 1617 $this->token_length = $closer_at + $span_of_dashes + 1 - $this->token_starts_at; 1618 1619 // Only provide modifiable text if the token is long enough to contain it. 1620 if ( $span_of_dashes >= 2 ) { 1621 $this->comment_type = self::COMMENT_AS_HTML_COMMENT; 1622 $this->text_starts_at = $this->token_starts_at + 4; 1623 $this->text_length = $span_of_dashes - 2; 1624 } 1625 1626 $this->bytes_already_parsed = $closer_at + $span_of_dashes + 1; 1627 return true; 1388 1628 } 1389 1629 … … 1398 1638 $closer_at = strpos( $html, '--', $closer_at ); 1399 1639 if ( false === $closer_at ) { 1400 $this->parser_state = self::STATE_INCOMPLETE ;1640 $this->parser_state = self::STATE_INCOMPLETE; 1401 1641 1402 1642 return false; … … 1404 1644 1405 1645 if ( $closer_at + 2 < $doc_length && '>' === $html[ $closer_at + 2 ] ) { 1406 $at = $closer_at + 3; 1407 continue 2; 1646 $this->parser_state = self::STATE_COMMENT; 1647 $this->comment_type = self::COMMENT_AS_HTML_COMMENT; 1648 $this->token_length = $closer_at + 3 - $this->token_starts_at; 1649 $this->text_starts_at = $this->token_starts_at + 4; 1650 $this->text_length = $closer_at - $this->text_starts_at; 1651 $this->bytes_already_parsed = $closer_at + 3; 1652 return true; 1408 1653 } 1409 1654 1410 if ( $closer_at + 3 < $doc_length && '!' === $html[ $closer_at + 2 ] && '>' === $html[ $closer_at + 3 ] ) { 1411 $at = $closer_at + 4; 1412 continue 2; 1655 if ( 1656 $closer_at + 3 < $doc_length && 1657 '!' === $html[ $closer_at + 2 ] && 1658 '>' === $html[ $closer_at + 3 ] 1659 ) { 1660 $this->parser_state = self::STATE_COMMENT; 1661 $this->comment_type = self::COMMENT_AS_HTML_COMMENT; 1662 $this->token_length = $closer_at + 4 - $this->token_starts_at; 1663 $this->text_starts_at = $this->token_starts_at + 4; 1664 $this->text_length = $closer_at - $this->text_starts_at; 1665 $this->bytes_already_parsed = $closer_at + 4; 1666 return true; 1413 1667 } 1414 1668 } … … 1416 1670 1417 1671 /* 1418 * <![CDATA[ transitions to CDATA section state – skip to the nearest ]]> 1419 * The CDATA is case-sensitive. 1420 * https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state 1421 */ 1422 if ( 1423 $doc_length > $at + 8 && 1424 '[' === $html[ $at + 2 ] && 1425 'C' === $html[ $at + 3 ] && 1426 'D' === $html[ $at + 4 ] && 1427 'A' === $html[ $at + 5 ] && 1428 'T' === $html[ $at + 6 ] && 1429 'A' === $html[ $at + 7 ] && 1430 '[' === $html[ $at + 8 ] 1431 ) { 1432 $closer_at = strpos( $html, ']]>', $at + 9 ); 1433 if ( false === $closer_at ) { 1434 $this->parser_state = self::STATE_INCOMPLETE; 1435 1436 return false; 1437 } 1438 1439 $at = $closer_at + 3; 1440 continue; 1441 } 1442 1443 /* 1444 * <!DOCTYPE transitions to DOCTYPE state – skip to the nearest > 1672 * `<!DOCTYPE` transitions to DOCTYPE state – skip to the nearest > 1445 1673 * These are ASCII-case-insensitive. 1446 1674 * https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state … … 1458 1686 $closer_at = strpos( $html, '>', $at + 9 ); 1459 1687 if ( false === $closer_at ) { 1460 $this->parser_state = self::STATE_INCOMPLETE ;1688 $this->parser_state = self::STATE_INCOMPLETE; 1461 1689 1462 1690 return false; 1463 1691 } 1464 1692 1465 $at = $closer_at + 1; 1466 continue; 1693 $this->parser_state = self::STATE_DOCTYPE; 1694 $this->token_length = $closer_at + 1 - $this->token_starts_at; 1695 $this->text_starts_at = $this->token_starts_at + 9; 1696 $this->text_length = $closer_at - $this->text_starts_at; 1697 $this->bytes_already_parsed = $closer_at + 1; 1698 return true; 1467 1699 } 1468 1700 … … 1472 1704 * found then the HTML was truncated inside the markup declaration. 1473 1705 */ 1474 $ at = strpos( $html, '>', $at + 1 );1475 if ( false === $ at ) {1476 $this->parser_state = self::STATE_INCOMPLETE ;1706 $at = strpos( $html, '>', $at + 1 ); 1707 if ( false === $at ) { 1708 $this->parser_state = self::STATE_INCOMPLETE; 1477 1709 1478 1710 return false; 1479 1711 } 1480 1712 1481 continue; 1713 $this->parser_state = self::STATE_COMMENT; 1714 $this->comment_type = self::COMMENT_AS_INVALID_HTML; 1715 $this->token_length = $closer_at + 1 - $this->token_starts_at; 1716 $this->text_starts_at = $this->token_starts_at + 2; 1717 $this->text_length = $closer_at - $this->text_starts_at; 1718 $this->bytes_already_parsed = $closer_at + 1; 1719 1720 /* 1721 * Identify nodes that would be CDATA if HTML had CDATA sections. 1722 * 1723 * This section must occur after identifying the bogus comment end 1724 * because in an HTML parser it will span to the nearest `>`, even 1725 * if there's no `]]>` as would be required in an XML document. It 1726 * is therefore not possible to parse a CDATA section containing 1727 * a `>` in the HTML syntax. 1728 * 1729 * Inside foreign elements there is a discrepancy between browsers 1730 * and the specification on this. 1731 * 1732 * @todo Track whether the Tag Processor is inside a foreign element 1733 * and require the proper closing `]]>` in those cases. 1734 */ 1735 if ( 1736 $this->token_length >= 10 && 1737 '[' === $html[ $this->token_starts_at + 2 ] && 1738 'C' === $html[ $this->token_starts_at + 3 ] && 1739 'D' === $html[ $this->token_starts_at + 4 ] && 1740 'A' === $html[ $this->token_starts_at + 5 ] && 1741 'T' === $html[ $this->token_starts_at + 6 ] && 1742 'A' === $html[ $this->token_starts_at + 7 ] && 1743 '[' === $html[ $this->token_starts_at + 8 ] && 1744 ']' === $html[ $closer_at - 1 ] 1745 ) { 1746 $this->parser_state = self::STATE_COMMENT; 1747 $this->comment_type = self::COMMENT_AS_CDATA_LOOKALIKE; 1748 $this->text_starts_at += 7; 1749 $this->text_length -= 9; 1750 } 1751 1752 return true; 1482 1753 } 1483 1754 … … 1492 1763 */ 1493 1764 if ( '>' === $html[ $at + 1 ] ) { 1494 ++$at; 1495 continue; 1765 $this->parser_state = self::STATE_PRESUMPTUOUS_TAG; 1766 $this->token_length = $at + 2 - $this->token_starts_at; 1767 $this->bytes_already_parsed = $at + 2; 1768 return true; 1496 1769 } 1497 1770 1498 1771 /* 1499 * <?transitions to a bogus comment state – skip to the nearest >1772 * transitions to a bogus comment state – skip to the nearest > 1500 1773 * See https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state 1501 1774 */ … … 1503 1776 $closer_at = strpos( $html, '>', $at + 2 ); 1504 1777 if ( false === $closer_at ) { 1505 $this->parser_state = self::STATE_INCOMPLETE ;1778 $this->parser_state = self::STATE_INCOMPLETE; 1506 1779 1507 1780 return false; 1508 1781 } 1509 1782 1510 $at = $closer_at + 1; 1511 continue; 1783 $this->parser_state = self::STATE_COMMENT; 1784 $this->comment_type = self::COMMENT_AS_INVALID_HTML; 1785 $this->token_length = $closer_at + 1 - $this->token_starts_at; 1786 $this->text_starts_at = $this->token_starts_at + 2; 1787 $this->text_length = $closer_at - $this->text_starts_at; 1788 $this->bytes_already_parsed = $closer_at + 1; 1789 1790 /* 1791 * Identify a Processing Instruction node were HTML to have them. 1792 * 1793 * This section must occur after identifying the bogus comment end 1794 * because in an HTML parser it will span to the nearest `>`, even 1795 * if there's no `?>` as would be required in an XML document. It 1796 * is therefore not possible to parse a Processing Instruction node 1797 * containing a `>` in the HTML syntax. 1798 * 1799 * XML allows for more target names, but this code only identifies 1800 * those with ASCII-representable target names. This means that it 1801 * may identify some Processing Instruction nodes as bogus comments, 1802 * but it will not misinterpret the HTML structure. By limiting the 1803 * identification to these target names the Tag Processor can avoid 1804 * the need to start parsing UTF-8 sequences. 1805 * 1806 * > NameStartChar ::= ":" | [A-Z] | "_" | [a-z] | [#xC0-#xD6] | [#xD8-#xF6] | [#xF8-#x2FF] | 1807 * [#x370-#x37D] | [#x37F-#x1FFF] | [#x200C-#x200D] | [#x2070-#x218F] | 1808 * [#x2C00-#x2FEF] | [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] | 1809 * [#x10000-#xEFFFF] 1810 * > NameChar ::= NameStartChar | "-" | "." | [0-9] | #xB7 | [#x0300-#x036F] | [#x203F-#x2040] 1811 * 1812 * @see https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-PITarget 1813 */ 1814 if ( $this->token_length >= 5 && '?' === $html[ $closer_at - 1 ] ) { 1815 $comment_text = substr( $html, $this->token_starts_at + 2, $this->token_length - 4 ); 1816 $pi_target_length = strspn( $comment_text, 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ:_' ); 1817 1818 if ( 0 < $pi_target_length ) { 1819 $pi_target_length += strspn( $comment_text, 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789:_-.', $pi_target_length ); 1820 1821 $this->comment_type = self::COMMENT_AS_PI_NODE_LOOKALIKE; 1822 $this->tag_name_starts_at = $this->token_starts_at + 2; 1823 $this->tag_name_length = $pi_target_length; 1824 $this->text_starts_at += $pi_target_length; 1825 $this->text_length -= $pi_target_length + 1; 1826 } 1827 } 1828 1829 return true; 1512 1830 } 1513 1831 … … 1515 1833 * If a non-alpha starts the tag name in a tag closer it's a comment. 1516 1834 * Find the first `>`, which closes the comment. 1835 1836 1837 1517 1838 * 1518 1839 * See https://html.spec.whatwg.org/#parse-error-invalid-first-character-of-tag-name … … 1526 1847 $closer_at = strpos( $html, '>', $at + 3 ); 1527 1848 if ( false === $closer_at ) { 1528 $this->parser_state = self::STATE_INCOMPLETE ;1849 $this->parser_state = self::STATE_INCOMPLETE; 1529 1850 1530 1851 return false; 1531 1852 } 1532 1853 1533 $at = $closer_at + 1; 1534 continue; 1854 $this->parser_state = self::STATE_FUNKY_COMMENT; 1855 $this->token_length = $closer_at + 1 - $this->token_starts_at; 1856 $this->text_starts_at = $this->token_starts_at + 2; 1857 $this->text_length = $closer_at - $this->text_starts_at; 1858 $this->bytes_already_parsed = $closer_at + 1; 1859 return true; 1535 1860 } 1536 1861 … … 1552 1877 $this->bytes_already_parsed += strspn( $this->html, " \t\f\r\n/", $this->bytes_already_parsed ); 1553 1878 if ( $this->bytes_already_parsed >= strlen( $this->html ) ) { 1554 $this->parser_state = self::STATE_INCOMPLETE ;1879 $this->parser_state = self::STATE_INCOMPLETE; 1555 1880 1556 1881 return false; … … 1576 1901 $this->bytes_already_parsed += $name_length; 1577 1902 if ( $this->bytes_already_parsed >= strlen( $this->html ) ) { 1578 $this->parser_state = self::STATE_INCOMPLETE ;1903 $this->parser_state = self::STATE_INCOMPLETE; 1579 1904 1580 1905 return false; … … 1583 1908 $this->skip_whitespace(); 1584 1909 if ( $this->bytes_already_parsed >= strlen( $this->html ) ) { 1585 $this->parser_state = self::STATE_INCOMPLETE ;1910 $this->parser_state = self::STATE_INCOMPLETE; 1586 1911 1587 1912 return false; … … 1593 1918 $this->skip_whitespace(); 1594 1919 if ( $this->bytes_already_parsed >= strlen( $this->html ) ) { 1595 $this->parser_state = self::STATE_INCOMPLETE ;1920 $this->parser_state = self::STATE_INCOMPLETE; 1596 1921 1597 1922 return false; … … 1621 1946 1622 1947 if ( $attribute_end >= strlen( $this->html ) ) { 1623 $this->parser_state = self::STATE_INCOMPLETE ;1948 $this->parser_state = self::STATE_INCOMPLETE; 1624 1949 1625 1950 return false; … … 1693 2018 $this->tag_name_starts_at = null; 1694 2019 $this->tag_name_length = null; 2020 2021 1695 2022 $this->is_closing_tag = null; 1696 2023 $this->attributes = array(); 2024 1697 2025 $this->duplicate_attributes = null; 1698 2026 } … … 1986 2314 // Point this tag processor before the sought tag opener and consume it. 1987 2315 $this->bytes_already_parsed = $this->bookmarks[ $bookmark_name ]->start; 1988 return $this->next_t ag( array( 'tag_closers' => 'visit' ));2316 return $this->next_t); 1989 2317 } 1990 2318 … … 2217 2545 */ 2218 2546 public function get_tag() { 2219 if ( self::STATE_MATCHED_TAG !== $this->parser_state) {2547 if ( ) { 2220 2548 return null; 2221 2549 } … … 2223 2551 $tag_name = substr( $this->html, $this->tag_name_starts_at, $this->tag_name_length ); 2224 2552 2225 return strtoupper( $tag_name ); 2553 if ( self::STATE_MATCHED_TAG === $this->parser_state ) { 2554 return strtoupper( $tag_name ); 2555 } 2556 2557 if ( 2558 self::STATE_COMMENT === $this->parser_state && 2559 self::COMMENT_AS_PI_NODE_LOOKALIKE === $this->get_comment_type() 2560 ) { 2561 return $tag_name; 2562 } 2563 2564 return null; 2226 2565 } 2227 2566 … … 2280 2619 $this->is_closing_tag 2281 2620 ); 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2282 2806 } 2283 2807 … … 2747 3271 2748 3272 /** 2749 * Parser Ready State 3273 * Parser Ready State 2750 3274 * 2751 3275 * Indicates that the parser is ready to run and waiting for a state transition. … … 2760 3284 2761 3285 /** 2762 * Parser Complete State 3286 * Parser Complete State 2763 3287 * 2764 3288 * Indicates that the parser has reached the end of the document and there is … … 2772 3296 2773 3297 /** 2774 * Parser Incomplete State3298 * Parser Incomplete 2775 3299 * 2776 3300 * Indicates that the parser has reached the end of the document before finishing … … 2785 3309 * @access private 2786 3310 */ 2787 const STATE_INCOMPLETE = 'STATE_INCOMPLETE';2788 2789 /** 2790 * Parser Matched Tag State 3311 const STATE_INCOMPLETE'; 3312 3313 /** 3314 * Parser Matched Tag State 2791 3315 * 2792 3316 * Indicates that the parser has found an HTML tag and it's possible to get … … 2798 3322 */ 2799 3323 const STATE_MATCHED_TAG = 'STATE_MATCHED_TAG'; 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 2800 3473 } -
trunk/tests/phpunit/tests/html-api/wpHtmlProcessorBreadcrumbs.php
r57343 r57348 515 515 */ 516 516 public function test_can_seek_back_and_forth() { 517 $p = WP_HTML_Processor::create_fragment( '<div><p one><div><p><div two><p><div><p><div><p three>' ); 517 $p = WP_HTML_Processor::create_fragment( 518 <<<'HTML' 519 <div>text<p one>more stuff<div><![CDATA[this is not real CDATA]]><p><!-- hi --><div two><p><div><p>three comes soon<div><p three>' ); 520 HTML 521 ); 518 522 519 523 // Find first tag of interest. -
trunk/tests/phpunit/tests/html-api/wpHtmlTagProcessor.php
r57211 r57348 558 558 559 559 $p->next_tag(); 560 $this->assertTrue( $p->next_tag( array( 'tag_closers' => 'visit' ) ), 'Did not find the </script> tag closer' ); 561 $this->assertTrue( $p->is_tag_closer(), 'Indicated a <script> tag opener is a tag closer' ); 560 $this->assertFalse( 561 $p->next_tag( array( 'tag_closers' => 'visit' ) ), 562 'Should not have found closing SCRIPT tag when closing an opener.' 563 ); 562 564 563 565 $p = new WP_HTML_Tag_Processor( 'abc</script>' ); … … 567 569 568 570 $p->next_tag(); 569 $this->assertTrue( $p->next_tag( array( 'tag_closers' => 'visit' ) ), 'Did not find the </textarea> tag closer' ); 570 $this->assertTrue( $p->is_tag_closer(), 'Indicated a <textarea> tag opener is a tag closer' ); 571 $this->assertFalse( 572 $p->next_tag( array( 'tag_closers' => 'visit' ) ), 573 'Should not have found closing TEXTAREA when closing an opener.' 574 ); 571 575 572 576 $p = new WP_HTML_Tag_Processor( 'abc</textarea>' ); … … 576 580 577 581 $p->next_tag(); 578 $this->assertTrue( $p->next_tag( array( 'tag_closers' => 'visit' ) ), 'Did not find the </title> tag closer' ); 579 $this->assertTrue( $p->is_tag_closer(), 'Indicated a <title> tag opener is a tag closer' ); 582 $this->assertFalse( 583 $p->next_tag( array( 'tag_closers' => 'visit' ) ), 584 'Should not have found closing TITLE when closing an opener.' 585 ); 580 586 581 587 $p = new WP_HTML_Tag_Processor( 'abc</title>' ); … … 2358 2364 'Text with comments' => array( 'One <!-- sneaky --> comment.' ), 2359 2365 'Empty tag closer' => array( '</>' ), 2366 2360 2367 'Processing instruction' => array( '<?xml version="1.0"?>' ), 2361 2368 'Combination XML-like' => array( '<!DOCTYPE xml><?xml version=""?><!-- this is not a real document. --><![CDATA[it only serves as a test]]>' ), … … 2411 2418 'Partial CDATA' => array( '<![CDA' ), 2412 2419 'Partially closed CDATA]' => array( '<![CDATA[cannot escape]' ), 2413 'Partially closed CDATA]>' => array( '<![CDATA[cannot escape]>' ),2414 2420 'Unclosed IFRAME' => array( '<iframe><div>' ), 2415 2421 'Unclosed NOEMBED' => array( '<noembed><div>' ), … … 2508 2514 'tag inside of CDATA' => array( 2509 2515 'input' => '<![CDATA[This <is> a <strong id="yes">HTML Tag</strong>]]><span>test</span>', 2510 'expected' => '<![CDATA[This <is> a <strong id="yes">HTML Tag</strong>]]><span class="firstTag" foo="bar">test</span>',2516 'expected' => '<![CDATA[This <is> a <strong ">test</span>', 2511 2517 ), 2512 2518 );
Note: See TracChangeset
for help on using the changeset viewer.