diff --git a/src/wp-includes/formatting.php b/src/wp-includes/formatting.php index f59f877775b77..4fe0dba9a9bd4 100644 --- a/src/wp-includes/formatting.php +++ b/src/wp-includes/formatting.php @@ -4760,6 +4760,16 @@ function esc_textarea( $text ) { function esc_xml( $text ) { $safe_text = wp_check_invalid_utf8( $text ); + // Strip invalid XML characters. + $is_utf8 = in_array( get_option( 'blog_charset' ), array( 'utf8', 'utf-8', 'UTF8', 'UTF-8' ), true ); + if ( $is_utf8 ) { + $safe_text = preg_replace( + '/[^\x{9}\x{A}\x{D}\x{20}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}]/u', + '', + $safe_text + ); + } + $cdata_regex = '\<\!\[CDATA\[.*?\]\]\>'; $regex = <<assertSame( $expected, $actual ); + } + + /** + * Data provider for `test_strips_invalid_xml_characters()`. + * + * @return array { + * @type string $source The source string containing invalid XML characters. + * @type string $expected The expected string with invalid characters removed. + * } + */ + public function data_strips_invalid_xml_characters() { + return array( + // Vertical tab (0x0B) - invalid in XML. + array( + "This contains a vertical tab\x0Bcharacter", + 'This contains a vertical tabcharacter', + ), + // File separator (0x1C) - invalid in XML. + array( + "File separator\x1Ctest", + 'File separatortest', + ), + // NULL byte (0x00) - invalid in XML. + array( + "Text with\x00null byte", + 'Text withnull byte', + ), + // Bell character (0x07) - invalid in XML. + array( + "Bell\x07character", + 'Bellcharacter', + ), + // Multiple invalid characters. + array( + "Multiple\x00invalid\x0B\x1Ccharacters\x07here", + 'Multipleinvalidcharactershere', + ), + // Valid control characters should be preserved: tab (0x09), LF (0x0A), CR (0x0D). + array( + "Tab\tlinefeed\ncarriage return\rtest", + "Tab\tlinefeed\ncarriage return\rtest", + ), + // Mix of valid and invalid. 
+ array( + "Valid\ttab but\x0Binvalid vertical tab", + "Valid\ttab butinvalid vertical tab", + ), + // Text without invalid characters should remain unchanged. + array( + 'Normal text with spaces and punctuation!', + 'Normal text with spaces and punctuation!', + ), + // Unicode characters in valid range should be preserved. + array( + 'Unicode: café, naïve, 日本語', + 'Unicode: café, naïve, 日本語', + ), + ); + } + + /** + * Test that invalid XML characters in the text outside of CDATA delimiters are stripped. + */ + public function test_strips_invalid_xml_characters_outside_cdata() { + update_option( 'blog_charset', 'UTF-8' ); + $source = "Text\x0Bwith]]>and\x1Cmore\x00invalid"; + $expected = 'Textwith]]>andmoreinvalid'; + $actual = esc_xml( $source ); + $this->assertSame( $expected, $actual ); + } + + /** + * Test that esc_xml() still returns a string when the blog charset is not UTF-8, in which case invalid-character stripping is skipped. + */ + public function test_non_utf8_charset_skips_invalid_character_stripping() { + update_option( 'blog_charset', 'ISO-8859-1' ); + $source = "Test\x0Btext"; + $actual = esc_xml( $source ); + $this->assertIsString( $actual ); + } }