[feaLib] Make nameid parsing more robust

We now correctly handle nameid statements with surrogate pairs and old-style macOS-encoded names (provided that fonttools supports the specified encoding). Resolves https://github.com/fonttools/fonttools/issues/842.
2017-02-14 12:39:41 +01:00 · 2017-02-14 12:39:41 +01:00 · eac7ef89c0
commit eac7ef89c0
parent b22df7ff48
5 changed files with 43 additions and 32 deletions
--- a/Lib/fontTools/feaLib/parser.py
+++ b/Lib/fontTools/feaLib/parser.py
@ -2,6 +2,7 @@ from __future__ import print_function, division, absolute_import
 from __future__ import unicode_literals
 from fontTools.feaLib.error import FeatureLibError
 from fontTools.feaLib.lexer import Lexer, IncludingLexer
+from fontTools.misc.encodingTools import getEncoding
 from fontTools.misc.py23 import *
 import fontTools.feaLib.ast as ast
 import logging
@ -863,14 +864,15 @@ class Parser(object):
        langID = None
        if self.next_token_type_ == Lexer.NUMBER:
            platformID = self.expect_number_()
+            location = self.cur_token_location_
            if platformID not in (1, 3):
-                raise FeatureLibError("Expected platform id 1 or 3",
-                                      self.cur_token_location_)
+                raise FeatureLibError("Expected platform id 1 or 3", location)
            if self.next_token_type_ == Lexer.NUMBER:
                platEncID = self.expect_number_()
                langID = self.expect_number_()
        else:
            platformID = 3
+            location = self.cur_token_location_

        if platformID == 1:                # Macintosh
            platEncID = platEncID or 0     # Roman
@ -882,12 +884,11 @@ class Parser(object):
        string = self.expect_string_()
        self.expect_symbol_(";")

-        if platformID == 1 and platEncID == 0:
-            string = self.unescape_mac_name_string(string)
-        elif platformID == 3 and platEncID == 1:
-            string = self.unescape_windows_name_string(string)
-
-        return platformID, platEncID, langID, string
+        encoding = getEncoding(platformID, platEncID, langID)
+        if encoding is None:
+            raise FeatureLibError("Unsupported encoding", location)
+        unescaped = self.unescape_string_(string, encoding)
+        return platformID, platEncID, langID, unescaped

    def parse_nameid_(self):
        assert self.cur_token_ == "nameid", self.cur_token_
@ -905,21 +906,27 @@ class Parser(object):
        return self.ast.NameRecord(location, nameID, platformID, platEncID,
                                   langID, string)

-    def unescape_mac_name_string(self, string):
-        def unescape(match):
-            n = match.group(0)[1:]
-            c = bytechr(int(n, 16)).decode('mac_roman')
-            return c
+    def unescape_string_(self, string, encoding):
+        if encoding == "utf_16_be":
+            s = re.sub(r"\\[0-9a-fA-F]{4}", self.unescape_unichr_, string)
+        else:
+            unescape = lambda m: self.unescape_byte_(m, encoding)
+            s = re.sub(r"\\[0-9a-fA-F]{2}", unescape, string)
+        # We now have a Unicode string, but it might contain surrogate pairs.
+        # We convert surrogates to actual Unicode by round-tripping through
+        # Python's UTF-16 codec in a special mode.
+        utf16 = tobytes(s, "utf_16_be", "surrogatepass")
+        return tounicode(utf16, "utf_16_be")

-        return re.sub(r'\\[0-9a-fA-F]{2}', unescape, string)
+    @staticmethod
+    def unescape_unichr_(match):
+        n = match.group(0)[1:]
+        return unichr(int(n, 16))

-    def unescape_windows_name_string(self, string):
-        def unescape(match):
-            n = match.group(0)[1:]
-            c = unichr(int(n, 16))
-            return c
-
-        return re.sub(r'\\[0-9a-fA-F]{4}', unescape, string)
+    @staticmethod
+    def unescape_byte_(match, encoding):
+        n = match.group(0)[1:]
+        return bytechr(int(n, 16)).decode(encoding)

    def parse_table_BASE_(self, table):
        statements = table.statements
--- a/NEWS.rst
+++ b/NEWS.rst
@ -1,4 +1,7 @@
 - [feaLib] include statements now resolve relative paths like makeotf (#838)
+- [feaLib] `table name` now handles Unicode codepoints beyond the Basic
+  Multilingual Plane and supports old-style MacOS platform encodings (#842)
+- [feaLib] correctly escape string literals when emitting feature syntax (#780)

 3.7.0 (released 2017-02-11)
 ---------------------------
--- a/Tests/feaLib/data/spec8b.fea
+++ b/Tests/feaLib/data/spec8b.fea
@ -4,5 +4,9 @@ feature size {
 # 139 - range end (inclusive, decipoints)
   sizemenuname "Win MinionPro Size Name";
   sizemenuname 1 "Mac MinionPro Size Name";
-   sizemenuname 1 21 0 "Mac MinionPro Size Name";
+   # The specification says: sizemenuname 1 21 0 "Mac MinionPro Size Name";
+   # which means Macintosh platform, MacOS Thai encoding, English language.
+   # Since fonttools currently does not support the MacOS Thai encoding,
+   # we instead use MacOS Roman encoding (0), Swedish language (5) for our test.
+   sizemenuname 1 0 5 "Mac MinionPro Size Name";
 } size;
--- a/Tests/feaLib/data/spec8b.ttx
+++ b/Tests/feaLib/data/spec8b.ttx
@ -8,7 +8,7 @@
    <namerecord nameID="256" platformID="1" platEncID="0" langID="0x0" unicode="True">
      Mac MinionPro Size Name
    </namerecord>
-    <namerecord nameID="256" platformID="1" platEncID="21" langID="0x0" unicode="True">
+    <namerecord nameID="256" platformID="1" platEncID="0" langID="0x5" unicode="True">
      Mac MinionPro Size Name
    </namerecord>
  </name>
--- a/Tests/feaLib/parser_test.py
+++ b/Tests/feaLib/parser_test.py
@ -928,12 +928,10 @@ class ParserTest(unittest.TestCase):
        self.assertEquals(name.asFea(), r'nameid 9 "Quotation \0022Mark\0022";')

    def test_nameid_windows_utf16_surroates(self):
-        pass
-        # TODO: https://github.com/fonttools/fonttools/issues/842
-        # doc = self.parse(r'table name { nameid 9 "Carrot \D83E\DD55"; } name;')
-        # name = doc.statements[0].statements[0]
-        # self.assertEquals(name.string, r"Carrot 🥕")
-        # self.assertEquals(name.asFea(), r'nameid 9 "Carrot \d83e\dd55";')
+        doc = self.parse(r'table name { nameid 9 "Carrot \D83E\DD55"; } name;')
+        name = doc.statements[0].statements[0]
+        self.assertEquals(name.string, r"Carrot 🥕")
+        self.assertEquals(name.asFea(), r'nameid 9 "Carrot \d83e\dd55";')

    def test_nameid_mac_roman(self):
        doc = self.parse(
@ -956,9 +954,8 @@ class ParserTest(unittest.TestCase):
        self.assertEquals(name.platformID, 1)
        self.assertEquals(name.platEncID, 0)
        self.assertEquals(name.langID, 18)
-        # TODO: https://github.com/fonttools/fonttools/issues/842
-        # self.assertEquals(name.string, "Jovica Veljović")
-        # self.assertEquals(name.asFea(), r'nameid 9 1 0 18 "Jovica Veljovi\e6";')
+        self.assertEquals(name.string, "Jovica Veljović")
+        self.assertEquals(name.asFea(), r'nameid 9 1 0 18 "Jovica Veljovi\e6";')

    def test_nameid_unsupported_platform(self):
        self.assertRaisesRegex(