etree: simplify regex for invalid xml chars

This commit is contained in:
Cosimo Lupo 2018-07-14 14:07:42 +01:00
parent d4f10f121a
commit 5f61bcfcb4
No known key found for this signature in database
GPG Key ID: 59D54DB0C9976482

View File

@@ -223,19 +223,12 @@ except ImportError:
import re import re
# any Unicode character, excluding the surrogate blocks, FFFE, and FFFF. # Valid XML strings can include any Unicode character, excluding control
# characters, the surrogate blocks, FFFE, and FFFF.
# Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF] # Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
UCS2 = sys.maxunicode < 0x10FFFF # This is the inverted regular expression matching any invalid character
if UCS2: _invalid_xml_string = re.compile(
# For 'narrow' python builds we need to match the UTF-16 surrogate pairs "[\u0000-\u0008\u000B-\u000C\u000E-\u001F\uD800-\uDFFF\uFFFE-\uFFFF]"
# for the characters beyond the BMP (0x10000..0x10FFFF).
_valid_xml_string = re.compile(
"^(?:[\u0009\u000a\u000d\u0020-\uD7FF\uE000-\uFFFD]|"
"(?:[\uD800-\uDBFF][\uDC00-\uDFFF]))+$"
)
else:
_valid_xml_string = re.compile(
"^[\u0009\u000a\u000d\u0020-\uD7FF\uE000-\uFFFD\U00010000-\U0010FFFF]+$"
) )
def _tounicode(s): def _tounicode(s):
@@ -248,7 +241,7 @@ except ImportError:
s = tounicode(s) s = tounicode(s)
except AttributeError: except AttributeError:
_raise_serialization_error(s) _raise_serialization_error(s)
if s and not _valid_xml_string.match(s): if s and _invalid_xml_string.search(s):
raise ValueError( raise ValueError(
"All strings must be XML compatible: Unicode or ASCII, " "All strings must be XML compatible: Unicode or ASCII, "
"no NULL bytes or control characters" "no NULL bytes or control characters"