etree: fix valid XML regex for narrow UCS-2 only pythons

fixes https://ci.appveyor.com/project/adrientetar/ufolib/build/1.0.489/job/epryi911juu5lqdl#L72
This commit is contained in:
Cosimo Lupo 2018-07-14 13:41:19 +01:00
parent 8eb7245773
commit eea1766d9a
No known key found for this signature in database
GPG Key ID: 59D54DB0C9976482

View File

@ -225,9 +225,18 @@ except ImportError:
# any Unicode character, excluding the surrogate blocks, FFFE, and FFFF.
# Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
# Anchored pattern: the whole string must consist of one or more characters
# allowed by the XML 1.0 ``Char`` production. The character class is spelled
# out range by range; adjacent literals are concatenated into one pattern.
_valid_xml_string = re.compile(
    "^["
    "\u0009\u000a\u000d"     # #x9 | #xA | #xD
    "\u0020-\uD7FF"          # [#x20-#xD7FF]
    "\uE000-\uFFFD"          # [#xE000-#xFFFD]
    "\U00010000-\U0010FFFF"  # [#x10000-#x10FFFF]
    "]+$"
)
# True on 'narrow' python builds, i.e. when sys.maxunicode is below 0x10FFFF.
UCS2 = sys.maxunicode < 0x10FFFF
if UCS2:
    # For 'narrow' python builds we need to match the UTF-16 surrogate pairs
    # for the characters beyond the BMP (0x10000..0x10FFFF): a BMP character
    # from the valid ranges, or a high surrogate followed by a low surrogate.
    _valid_xml_string = re.compile(
        "^(?:[\u0009\u000a\u000d\u0020-\uD7FF\uE000-\uFFFD]|"
        "(?:[\uD800-\uDBFF][\uDC00-\uDFFF]))+$"
    )
else:
    # Otherwise the supplementary planes can be written directly as one
    # range inside the character class.
    _valid_xml_string = re.compile(
        "^[\u0009\u000a\u000d\u0020-\uD7FF\uE000-\uFFFD\U00010000-\U0010FFFF]+$"
    )
def _tounicode(s):
"""Test if a string is valid user input and decode it to unicode string