etree: in invalid xml chars allow surrogates for 'narrow' pythons

2018-07-14 14:31:52 +01:00 · 2018-07-14 14:31:52 +01:00 · 1a94fbc121
commit 1a94fbc121
parent 37e4b32f6e
1 changed files with 15 additions and 6 deletions
--- a/Lib/ufoLib/etree.py
+++ b/Lib/ufoLib/etree.py
@ -224,12 +224,21 @@ except ImportError:
    import re

    # Valid XML strings can include any Unicode character, excluding control
-    # characters, the surrogate blocks, FFFE, and FFFF.
-    # Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
-    # This is the inverted regular expression matchin any invalid character
-    _invalid_xml_string = re.compile(
-        "[\u0000-\u0008\u000B-\u000C\u000E-\u001F\uD800-\uDFFF\uFFFE-\uFFFF]"
-    )
+    # characters, the surrogate blocks, FFFE, and FFFF:
+    #   Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
+    # The pattern below is inverted so that it matches only invalid characters.
+    # 'Narrow' Python builds support only UCS-2 and represent characters beyond
+    # the BMP as UTF-16 surrogate pairs, so there the surrogate block must be
+    # allowed through. I haven't found a more elegant solution...
+    UCS2 = sys.maxunicode < 0x10FFFF
+    if UCS2:
+        _invalid_xml_string = re.compile(
+            "[\u0000-\u0008\u000B-\u000C\u000E-\u001F\uFFFE-\uFFFF]"
+        )
+    else:
+        _invalid_xml_string = re.compile(
+            "[\u0000-\u0008\u000B-\u000C\u000E-\u001F\uD800-\uDFFF\uFFFE-\uFFFF]"
+        )

    def _tounicode(s):
        """Test if a string is valid user input and decode it to unicode string