etree: in invalid xml chars allow surrogates for 'narrow' pythons

This commit is contained in:
Cosimo Lupo 2018-07-14 14:31:52 +01:00
parent 37e4b32f6e
commit 1a94fbc121
No known key found for this signature in database
GPG Key ID: 59D54DB0C9976482

View File

@ -224,12 +224,21 @@ except ImportError:
import re
# Valid XML strings can include any Unicode character, excluding control
# characters, the surrogate blocks, FFFE, and FFFF.
# Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
# This is the inverted regular expression matching any invalid character.
# Character class matching every code point that may not appear in an XML
# string. Adjacent literals are concatenated into a single pattern.
_invalid_xml_string = re.compile(
    "["
    "\u0000-\u0008"  # C0 controls below TAB (U+0009)
    "\u000B-\u000C"  # vertical tab and form feed (LF and CR are allowed)
    "\u000E-\u001F"  # remaining C0 controls above CR (U+000D)
    "\uD800-\uDFFF"  # the surrogate block
    "\uFFFE-\uFFFF"  # U+FFFE and U+FFFF
    "]"
)
# characters, the surrogate blocks, FFFE, and FFFF:
# Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
# Here we reversed the pattern to match only the invalid characters.
# For the 'narrow' python builds supporting only UCS-2, which represent
# characters beyond BMP as UTF-16 surrogate pairs, we need to pass through
# the surrogate block. I haven't found a more elegant solution...
# True on 'narrow' builds, i.e. when sys.maxunicode is below U+10FFFF and the
# interpreter cannot hold every Unicode code point as a single character.
UCS2 = sys.maxunicode < 0x10FFFF
if UCS2:
    # The surrogate range (U+D800-U+DFFF) is deliberately absent from this
    # class: on a narrow build it is also how characters beyond the BMP are
    # stored, so rejecting it would reject valid text. The trade-off is that
    # unpaired surrogates are not detected here.
    _invalid_xml_string = re.compile(
        "[\u0000-\u0008\u000B-\u000C\u000E-\u001F\uFFFE-\uFFFF]"
    )
else:
    # Full-range builds: any surrogate code point is invalid in XML, along
    # with the C0 controls (except TAB, LF, CR), U+FFFE and U+FFFF.
    _invalid_xml_string = re.compile(
        "[\u0000-\u0008\u000B-\u000C\u000E-\u001F\uD800-\uDFFF\uFFFE-\uFFFF]"
    )
def _tounicode(s):
"""Test if a string is valid user input and decode it to unicode string