etree: in invalid xml chars allow surrogates for 'narrow' pythons

2018-07-14 14:31:52 +01:00 · 2018-07-14 14:31:52 +01:00 · 1a94fbc121
commit 1a94fbc121
parent 37e4b32f6e
1 changed files with 15 additions and 6 deletions
--- a/Lib/ufoLib/etree.py
+++ b/Lib/ufoLib/etree.py
@@ -224,12 +224,21 @@ except ImportError:
    import re
    # Valid XML strings can include any Unicode character, excluding control
-    # characters, the surrogate blocks, FFFE, and FFFF.
+    # characters, the surrogate blocks, FFFE, and FFFF:
-    # Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
+    #   Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
-    # This is the inverted regular expression matchin any invalid character
+    # Here we reversed the pattern to match only the invalid characters.
-    _invalid_xml_string = re.compile(
+    # For the 'narrow' python builds supporting only UCS-2, which represent
-        "[\u0000-\u0008\u000B-\u000C\u000E-\u001F\uD800-\uDFFF\uFFFE-\uFFFF]"
+    # characters beyond BMP as UTF-16 surrogate pairs, we need to pass through
-    )
+    # the surrogate block. I haven't found a more elegant solution...
    # True on 'narrow' Python builds, whose maximum code point is below
    # U+10FFFF and which represent non-BMP characters as surrogate pairs.
    UCS2 = sys.maxunicode < 0x10FFFF
    # Pattern matching any single character that is not allowed in XML text.
    # On narrow builds the surrogate block (U+D800-U+DFFF) is left out of the
    # pattern so that surrogate pairs pass through; otherwise it is included.
    _invalid_xml_string = re.compile(
        "[\u0000-\u0008\u000B-\u000C\u000E-\u001F\uFFFE-\uFFFF]"
        if UCS2
        else "[\u0000-\u0008\u000B-\u000C\u000E-\u001F\uD800-\uDFFF\uFFFE-\uFFFF]"
    )
    def _tounicode(s):
        """Test if a string is valid user input and decode it to unicode string