etree: allow surrogates in the invalid-XML-character pattern on 'narrow' Python builds
This commit is contained in:
parent
37e4b32f6e
commit
1a94fbc121
@ -224,12 +224,21 @@ except ImportError:
|
|||||||
import re
|
import re
|
||||||
|
|
||||||
# Valid XML strings can include any Unicode character, excluding control
|
# Valid XML strings can include any Unicode character, excluding control
|
||||||
# characters, the surrogate blocks, FFFE, and FFFF.
|
# characters, the surrogate blocks, FFFE, and FFFF:
|
||||||
# Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
|
# Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
|
||||||
# This is the inverted regular expression matching any invalid character
|
# Here we reversed the pattern to match only the invalid characters.
|
||||||
_invalid_xml_string = re.compile(
|
# For the 'narrow' Python builds supporting only UCS-2, which represent
|
||||||
"[\u0000-\u0008\u000B-\u000C\u000E-\u001F\uD800-\uDFFF\uFFFE-\uFFFF]"
|
# characters beyond the BMP as UTF-16 surrogate pairs, we need to pass through
|
||||||
)
|
# the surrogate block. I haven't found a more elegant solution...
|
||||||
|
UCS2 = sys.maxunicode < 0x10FFFF
|
||||||
|
if UCS2:
|
||||||
|
_invalid_xml_string = re.compile(
|
||||||
|
"[\u0000-\u0008\u000B-\u000C\u000E-\u001F\uFFFE-\uFFFF]"
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
_invalid_xml_string = re.compile(
|
||||||
|
"[\u0000-\u0008\u000B-\u000C\u000E-\u001F\uD800-\uDFFF\uFFFE-\uFFFF]"
|
||||||
|
)
|
||||||
|
|
||||||
def _tounicode(s):
|
def _tounicode(s):
|
||||||
"""Test if a string is valid user input and decode it to unicode string
|
"""Test if a string is valid user input and decode it to unicode string
|
||||||
|
Loading…
x
Reference in New Issue
Block a user