[feaLib] Make nameid parsing more robust

We now correctly handle nameid statements with surrogate pairs and old-style macOS-encoded names (provided that fonttools supports the specified encoding). Resolves https://github.com/fonttools/fonttools/issues/842.
2017-02-14 12:39:41 +01:00 · 2017-02-14 12:39:41 +01:00 · eac7ef89c0
commit eac7ef89c0
parent b22df7ff48
5 changed files with 43 additions and 32 deletions
--- a/Lib/fontTools/feaLib/parser.py
+++ b/Lib/fontTools/feaLib/parser.py
@ -2,6 +2,7 @@ from __future__ import print_function, division, absolute_import
 from __future__ import unicode_literals
 from fontTools.feaLib.error import FeatureLibError
 from fontTools.feaLib.lexer import Lexer, IncludingLexer
 from fontTools.misc.encodingTools import getEncoding
 from fontTools.misc.py23 import *
 import fontTools.feaLib.ast as ast
 import logging
@ -863,14 +864,15 @@ class Parser(object):
        langID = None
        if self.next_token_type_ == Lexer.NUMBER:
            platformID = self.expect_number_()
            location = self.cur_token_location_
            if platformID not in (1, 3):
-                raise FeatureLibError("Expected platform id 1 or 3",
+                raise FeatureLibError("Expected platform id 1 or 3", location)
                                      self.cur_token_location_)
            if self.next_token_type_ == Lexer.NUMBER:
                platEncID = self.expect_number_()
                langID = self.expect_number_()
        else:
            platformID = 3
            location = self.cur_token_location_
        if platformID == 1:                # Macintosh
            platEncID = platEncID or 0     # Roman
@ -882,12 +884,11 @@ class Parser(object):
        string = self.expect_string_()
        self.expect_symbol_(";")
-        if platformID == 1 and platEncID == 0:
+        encoding = getEncoding(platformID, platEncID, langID)
-            string = self.unescape_mac_name_string(string)
+        if encoding is None:
-        elif platformID == 3 and platEncID == 1:
+            raise FeatureLibError("Unsupported encoding", location)
-            string = self.unescape_windows_name_string(string)
+        unescaped = self.unescape_string_(string, encoding)
-
+        return platformID, platEncID, langID, unescaped
        return platformID, platEncID, langID, string
    def parse_nameid_(self):
        assert self.cur_token_ == "nameid", self.cur_token_
@ -905,21 +906,27 @@ class Parser(object):
        return self.ast.NameRecord(location, nameID, platformID, platEncID,
                                   langID, string)
-    def unescape_mac_name_string(self, string):
+    def unescape_string_(self, string, encoding):
-        def unescape(match):
+        if encoding == "utf_16_be":
            s = re.sub(r"\\[0-9a-fA-F]{4}", self.unescape_unichr_, string)
        else:
            unescape = lambda m: self.unescape_byte_(m, encoding)
            s = re.sub(r"\\[0-9a-fA-F]{2}", unescape, string)
        # We now have a Unicode string, but it might contain surrogate pairs.
        # We convert surrogates to actual Unicode by round-tripping through
        # Python's UTF-16 codec in a special mode.
        utf16 = tobytes(s, "utf_16_be", "surrogatepass")
        return tounicode(utf16, "utf_16_be")
    @staticmethod
    def unescape_unichr_(match):
        n = match.group(0)[1:]
-            c = bytechr(int(n, 16)).decode('mac_roman')
+        return unichr(int(n, 16))
            return c
-        return re.sub(r'\\[0-9a-fA-F]{2}', unescape, string)
+    @staticmethod
-
+    def unescape_byte_(match, encoding):
    def unescape_windows_name_string(self, string):
        def unescape(match):
        n = match.group(0)[1:]
-            c = unichr(int(n, 16))
+        return bytechr(int(n, 16)).decode(encoding)
            return c
        return re.sub(r'\\[0-9a-fA-F]{4}', unescape, string)
    def parse_table_BASE_(self, table):
        statements = table.statements
--- a/NEWS.rst
+++ b/NEWS.rst
@ -1,4 +1,7 @@
 - [feaLib] include statements now resolve relative paths like makeotf (#838)
 - [feaLib] `table name` now handles Unicode codepoints beyond the Basic
  Multilingual Plane and also supports old-style MacOS platform encodings (#842)
 - [feaLib] correctly escape string literals when emitting feature syntax (#780)
 3.7.0 (released 2017-02-11)
 ---------------------------
--- a/Tests/feaLib/data/spec8b.fea
+++ b/Tests/feaLib/data/spec8b.fea
@ -4,5 +4,9 @@ feature size {
 # 139 - range end (inclusive, decipoints)
   sizemenuname "Win MinionPro Size Name";
   sizemenuname 1 "Mac MinionPro Size Name";
-   sizemenuname 1 21 0 "Mac MinionPro Size Name";
+   # The specification says: sizemenuname 1 21 0 "Mac MinionPro Size Name";
   # which means Macintosh platform, MacOS Thai encoding, English language.
   # Since fonttools currently does not support the MacOS Thai encoding,
   # our test instead uses MacOS Roman encoding (0), Swedish language (5).
   sizemenuname 1 0 5 "Mac MinionPro Size Name";
 } size;
--- a/Tests/feaLib/data/spec8b.ttx
+++ b/Tests/feaLib/data/spec8b.ttx
@ -8,7 +8,7 @@
    <namerecord nameID="256" platformID="1" platEncID="0" langID="0x0" unicode="True">
      Mac MinionPro Size Name
    </namerecord>
-    <namerecord nameID="256" platformID="1" platEncID="21" langID="0x0" unicode="True">
+    <namerecord nameID="256" platformID="1" platEncID="0" langID="0x5" unicode="True">
      Mac MinionPro Size Name
    </namerecord>
  </name>
--- a/Tests/feaLib/parser_test.py
+++ b/Tests/feaLib/parser_test.py
@ -928,12 +928,10 @@ class ParserTest(unittest.TestCase):
        self.assertEquals(name.asFea(), r'nameid 9 "Quotation \0022Mark\0022";')
    def test_nameid_windows_utf16_surroates(self):
-        pass
+        doc = self.parse(r'table name { nameid 9 "Carrot \D83E\DD55"; } name;')
-        # TODO: https://github.com/fonttools/fonttools/issues/842
+        name = doc.statements[0].statements[0]
-        # doc = self.parse(r'table name { nameid 9 "Carrot \D83E\DD55"; } name;')
+        self.assertEquals(name.string, r"Carrot 🥕")
-        # name = doc.statements[0].statements[0]
+        self.assertEquals(name.asFea(), r'nameid 9 "Carrot \d83e\dd55";')
        # self.assertEquals(name.string, r"Carrot 🥕")
        # self.assertEquals(name.asFea(), r'nameid 9 "Carrot \d83e\dd55";')
    def test_nameid_mac_roman(self):
        doc = self.parse(
@ -956,9 +954,8 @@ class ParserTest(unittest.TestCase):
        self.assertEquals(name.platformID, 1)
        self.assertEquals(name.platEncID, 0)
        self.assertEquals(name.langID, 18)
-        # TODO: https://github.com/fonttools/fonttools/issues/842
+        self.assertEquals(name.string, "Jovica Veljović")
-        # self.assertEquals(name.string, "Jovica Veljović")
+        self.assertEquals(name.asFea(), r'nameid 9 1 0 18 "Jovica Veljovi\e6";')
        # self.assertEquals(name.asFea(), r'nameid 9 1 0 18 "Jovica Veljovi\e6";')
    def test_nameid_unsupported_platform(self):
        self.assertRaisesRegex(