From 89700b6e8bd09136944ce919b2fe8d92a08f5667 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Tue, 5 May 2026 13:13:24 +0300 Subject: [PATCH 1/2] gh-139489: Add xml.is_valid_text() --- Doc/library/xml.rst | 12 ++++++++++++ Doc/whatsnew/3.15.rst | 4 ++++ Lib/test/test_xml.py | 16 ++++++++++++++++ Lib/xml/utils.py | 12 ++++++++++++ ...026-05-05-13-12-58.gh-issue-139489.a8qqIM.rst | 2 ++ 5 files changed, 46 insertions(+) create mode 100644 Misc/NEWS.d/next/Library/2026-05-05-13-12-58.gh-issue-139489.a8qqIM.rst diff --git a/Doc/library/xml.rst b/Doc/library/xml.rst index f9ffaa9a94aacc..23e440cf8a6359 100644 --- a/Doc/library/xml.rst +++ b/Doc/library/xml.rst @@ -57,6 +57,18 @@ This module also defines utility functions. ..versionadded:: next +.. function:: is_valid_text(data) + + Return ``True`` if the string is a sequence of legal XML 1.0 characters, + ``False`` otherwise. + + Almost all characters are permitted in XML 1.0 document, except C0 control + characters (excluding TAB, CR and LF), surrogate characters and special + Unicode characters U+FFFE and U+FFFF. + + ..versionadded:: next + + .. _xml-security: .. _xml-vulnerabilities: diff --git a/Doc/whatsnew/3.15.rst b/Doc/whatsnew/3.15.rst index 828bc1d1d64d0a..ebdfb16e9ef03b 100644 --- a/Doc/whatsnew/3.15.rst +++ b/Doc/whatsnew/3.15.rst @@ -1660,6 +1660,10 @@ xml whether a string can be used as an element or attribute name in XML. (Contributed by Serhiy Storchaka in :gh:`139489`.) +* Add the :func:`xml.is_valid_text` function, which allows to check + whether a string can be used in the XML document. + (Contributed by Serhiy Storchaka in :gh:`139489`.) 
+ xml.parsers.expat ----------------- diff --git a/Lib/test/test_xml.py b/Lib/test/test_xml.py index fd3633e43982d7..3a8b92048166f2 100644 --- a/Lib/test/test_xml.py +++ b/Lib/test/test_xml.py @@ -22,6 +22,22 @@ def test_is_valid_name(self): for c in '<>/!?=\x00\x01\x7f\ud800\udfff\ufffe\uffff\U000F0000': self.assertFalse(is_valid_name('name' + c)) + def test_is_valid_text(self): + is_valid_text = xml.is_valid_text + self.assertTrue(is_valid_text('')) + self.assertTrue(is_valid_text('!0Aa_~ \r\n\t\x85\xa0')) + self.assertTrue(is_valid_text('\ud7ff\ue000\ufffd\U00010000\U0010ffff')) + self.assertFalse(is_valid_text('\x00')) + self.assertFalse(is_valid_text('\x01')) + self.assertFalse(is_valid_text('\x1f')) + self.assertTrue(is_valid_text('\x7f')) + self.assertTrue(is_valid_text('\x80')) + self.assertTrue(is_valid_text('\x9f')) + self.assertFalse(is_valid_text('\ud800')) + self.assertFalse(is_valid_text('\udfff')) + self.assertFalse(is_valid_text('\ufffe')) + self.assertFalse(is_valid_text('\uffff')) + if __name__ == '__main__': unittest.main() diff --git a/Lib/xml/utils.py b/Lib/xml/utils.py index c9a0b260675bed..532aa224dae677 100644 --- a/Lib/xml/utils.py +++ b/Lib/xml/utils.py @@ -23,3 +23,15 @@ def is_valid_name(name): '\uF900-\uFDCF\uFDF0-\uFFFD\U00010000-\U000EFFFF' ']*+', name) is not None + +# https://www.w3.org/TR/xml/#charsets +_ILLEGAL_XML_CHAR = ( + '[' + '\x00-\x08\x0B\x0C\x0E-\x1F' # C0 controls except TAB, CR and LF + '\uD800-\uDFFF' # the surrogate blocks + '\uFFFE\uFFFF' # special Unicode characters + ']') + +def is_valid_text(data): + """Test whether a string is a sequence of legal XML 1.0 characters.""" + return _re.search(_ILLEGAL_XML_CHAR, data) is None diff --git a/Misc/NEWS.d/next/Library/2026-05-05-13-12-58.gh-issue-139489.a8qqIM.rst b/Misc/NEWS.d/next/Library/2026-05-05-13-12-58.gh-issue-139489.a8qqIM.rst new file mode 100644 index 00000000000000..b2eeafa55ef9fa --- /dev/null +++ 
b/Misc/NEWS.d/next/Library/2026-05-05-13-12-58.gh-issue-139489.a8qqIM.rst @@ -0,0 +1,2 @@ +Add the :func:`xml.is_valid_text` function, which allows to check whether +a string can be used in the XML document. From 2523e52309553c591cf467acbfb15f8c368b6ab6 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Tue, 5 May 2026 19:24:45 +0300 Subject: [PATCH 2/2] Apply suggestions from code review Co-authored-by: Stan Ulbrych Co-authored-by: Serhiy Storchaka --- Doc/library/xml.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Doc/library/xml.rst b/Doc/library/xml.rst index 23e440cf8a6359..98be50e15ff463 100644 --- a/Doc/library/xml.rst +++ b/Doc/library/xml.rst @@ -54,7 +54,7 @@ This module also defines utility functions. "!", "?", and "=" are forbidden. The name cannot start with a digit or a character like "-", ".", and "·". - ..versionadded:: next + .. versionadded:: next .. function:: is_valid_text(data) @@ -62,11 +62,11 @@ This module also defines utility functions. Return ``True`` if the string is a sequence of legal XML 1.0 characters, ``False`` otherwise. - Almost all characters are permitted in XML 1.0 document, except C0 control + Almost all characters are permitted in XML 1.0 documents, except C0 control characters (excluding TAB, CR and LF), surrogate characters and special Unicode characters U+FFFE and U+FFFF. - ..versionadded:: next + .. versionadded:: next .. _xml-security: