etree: in invalid xml chars allow surrogates for 'narrow' pythons
This commit is contained in:
parent
37e4b32f6e
commit
1a94fbc121
@ -224,12 +224,21 @@ except ImportError:
|
||||
import re
|
||||
|
||||
# Valid XML strings can include any Unicode character, excluding control
|
||||
# characters, the surrogate blocks, FFFE, and FFFF.
|
||||
# Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
|
||||
# This is the inverted regular expression matching any invalid character
|
||||
_invalid_xml_string = re.compile(
|
||||
"[\u0000-\u0008\u000B-\u000C\u000E-\u001F\uD800-\uDFFF\uFFFE-\uFFFF]"
|
||||
)
|
||||
# characters, the surrogate blocks, FFFE, and FFFF:
|
||||
# Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
|
||||
# Here we reversed the pattern to match only the invalid characters.
|
||||
# For the 'narrow' python builds supporting only UCS-2, which represent
# characters beyond BMP as UTF-16 surrogate pairs, a well-formed pair
# (high surrogate immediately followed by a low surrogate) encodes a valid
# character and must be let through; only *unpaired* surrogates are invalid.
# On 'wide' builds a surrogate code point can never be part of a pair, so
# the whole surrogate block is rejected.

# True when this interpreter stores strings as UCS-2 code units.
UCS2 = sys.maxunicode < 0x10FFFF

if UCS2:
    _invalid_xml_string = re.compile(
        # Control characters (except TAB, LF, CR) and the FFFE/FFFF
        # non-characters.
        "[\u0000-\u0008\u000B-\u000C\u000E-\u001F\uFFFE-\uFFFF]"
        # A high surrogate that is not followed by a low surrogate.
        "|[\uD800-\uDBFF](?![\uDC00-\uDFFF])"
        # A low surrogate that is not preceded by a high surrogate.
        "|(?<![\uD800-\uDBFF])[\uDC00-\uDFFF]"
    )
else:
    _invalid_xml_string = re.compile(
        "[\u0000-\u0008\u000B-\u000C\u000E-\u001F\uD800-\uDFFF\uFFFE-\uFFFF]"
    )
|
||||
|
||||
def _tounicode(s):
|
||||
"""Test if a string is valid user input and decode it to unicode string
|
||||
|
Loading…
x
Reference in New Issue
Block a user