[feaLib] Make nameid parsing more robust

We now correctly handle nameid statements with surrogate pairs and
old-style macOS-encoded names (provided that fonttools supports the
specified encoding).

Resolves https://github.com/fonttools/fonttools/issues/842.
This commit is contained in:
Sascha Brawer 2017-02-14 12:39:41 +01:00
parent b22df7ff48
commit eac7ef89c0
5 changed files with 43 additions and 32 deletions

View File

@ -2,6 +2,7 @@ from __future__ import print_function, division, absolute_import
from __future__ import unicode_literals
from fontTools.feaLib.error import FeatureLibError
from fontTools.feaLib.lexer import Lexer, IncludingLexer
from fontTools.misc.encodingTools import getEncoding
from fontTools.misc.py23 import *
import fontTools.feaLib.ast as ast
import logging
@ -863,14 +864,15 @@ class Parser(object):
langID = None
if self.next_token_type_ == Lexer.NUMBER:
platformID = self.expect_number_()
location = self.cur_token_location_
if platformID not in (1, 3):
raise FeatureLibError("Expected platform id 1 or 3",
self.cur_token_location_)
raise FeatureLibError("Expected platform id 1 or 3", location)
if self.next_token_type_ == Lexer.NUMBER:
platEncID = self.expect_number_()
langID = self.expect_number_()
else:
platformID = 3
location = self.cur_token_location_
if platformID == 1: # Macintosh
platEncID = platEncID or 0 # Roman
@ -882,12 +884,11 @@ class Parser(object):
string = self.expect_string_()
self.expect_symbol_(";")
if platformID == 1 and platEncID == 0:
string = self.unescape_mac_name_string(string)
elif platformID == 3 and platEncID == 1:
string = self.unescape_windows_name_string(string)
return platformID, platEncID, langID, string
encoding = getEncoding(platformID, platEncID, langID)
if encoding is None:
raise FeatureLibError("Unsupported encoding", location)
unescaped = self.unescape_string_(string, encoding)
return platformID, platEncID, langID, unescaped
def parse_nameid_(self):
assert self.cur_token_ == "nameid", self.cur_token_
@ -905,21 +906,27 @@ class Parser(object):
return self.ast.NameRecord(location, nameID, platformID, platEncID,
langID, string)
def unescape_mac_name_string(self, string):
def unescape(match):
n = match.group(0)[1:]
c = bytechr(int(n, 16)).decode('mac_roman')
return c
def unescape_string_(self, string, encoding):
if encoding == "utf_16_be":
s = re.sub(r"\\[0-9a-fA-F]{4}", self.unescape_unichr_, string)
else:
unescape = lambda m: self.unescape_byte_(m, encoding)
s = re.sub(r"\\[0-9a-fA-F]{2}", unescape, string)
# We now have a Unicode string, but it might contain surrogate pairs.
# We convert surrogates to actual Unicode by round-tripping through
# Python's UTF-16 codec in a special mode.
utf16 = tobytes(s, "utf_16_be", "surrogatepass")
return tounicode(utf16, "utf_16_be")
return re.sub(r'\\[0-9a-fA-F]{2}', unescape, string)
# Regex substitution callback: returns the character whose code is given by
# the hexadecimal digits that follow the first character of the match.
@staticmethod
def unescape_unichr_(match):
# Drop the first character of the match (the backslash of the escape).
n = match.group(0)[1:]
return unichr(int(n, 16))
def unescape_windows_name_string(self, string):
def unescape(match):
n = match.group(0)[1:]
c = unichr(int(n, 16))
return c
return re.sub(r'\\[0-9a-fA-F]{4}', unescape, string)
# Regex substitution callback: parses the hexadecimal digits that follow the
# first character of the match as an integer, converts it with bytechr
# (presumably to a single byte -- confirm against py23), and decodes the
# result using the given encoding.
@staticmethod
def unescape_byte_(match, encoding):
# Drop the first character of the match (the backslash of the escape).
n = match.group(0)[1:]
return bytechr(int(n, 16)).decode(encoding)
def parse_table_BASE_(self, table):
statements = table.statements

View File

@ -1,4 +1,7 @@
- [feaLib] include statements now resolve relative paths like makeotf (#838)
- [feaLib] `table name` now handles Unicode codepoints beyond the Basic
Multilingual Plane, and also supports old-style MacOS platform encodings (#842)
- [feaLib] correctly escape string literals when emitting feature syntax (#780)
3.7.0 (released 2017-02-11)
---------------------------

View File

@ -4,5 +4,9 @@ feature size {
# 139 - range end (inclusive, decipoints)
sizemenuname "Win MinionPro Size Name";
sizemenuname 1 "Mac MinionPro Size Name";
sizemenuname 1 21 0 "Mac MinionPro Size Name";
# The specification says: sizemenuname 1 21 0 "Mac MinionPro Size Name";
# which means Macintosh platform, MacOS Thai encoding, English language.
# Since fonttools currently does not support the MacOS Thai encoding,
# our test instead uses the MacOS Roman encoding (0) with Swedish language (5).
sizemenuname 1 0 5 "Mac MinionPro Size Name";
} size;

View File

@ -8,7 +8,7 @@
<namerecord nameID="256" platformID="1" platEncID="0" langID="0x0" unicode="True">
Mac MinionPro Size Name
</namerecord>
<namerecord nameID="256" platformID="1" platEncID="21" langID="0x0" unicode="True">
<namerecord nameID="256" platformID="1" platEncID="0" langID="0x5" unicode="True">
Mac MinionPro Size Name
</namerecord>
</name>

View File

@ -928,12 +928,10 @@ class ParserTest(unittest.TestCase):
self.assertEquals(name.asFea(), r'nameid 9 "Quotation \0022Mark\0022";')
def test_nameid_windows_utf16_surroates(self):
pass
# TODO: https://github.com/fonttools/fonttools/issues/842
# doc = self.parse(r'table name { nameid 9 "Carrot \D83E\DD55"; } name;')
# name = doc.statements[0].statements[0]
# self.assertEquals(name.string, r"Carrot 🥕")
# self.assertEquals(name.asFea(), r'nameid 9 "Carrot \d83e\dd55";')
doc = self.parse(r'table name { nameid 9 "Carrot \D83E\DD55"; } name;')
name = doc.statements[0].statements[0]
self.assertEquals(name.string, r"Carrot 🥕")
self.assertEquals(name.asFea(), r'nameid 9 "Carrot \d83e\dd55";')
def test_nameid_mac_roman(self):
doc = self.parse(
@ -956,9 +954,8 @@ class ParserTest(unittest.TestCase):
self.assertEquals(name.platformID, 1)
self.assertEquals(name.platEncID, 0)
self.assertEquals(name.langID, 18)
# TODO: https://github.com/fonttools/fonttools/issues/842
# self.assertEquals(name.string, "Jovica Veljović")
# self.assertEquals(name.asFea(), r'nameid 9 1 0 18 "Jovica Veljovi\e6";')
self.assertEquals(name.string, "Jovica Veljović")
self.assertEquals(name.asFea(), r'nameid 9 1 0 18 "Jovica Veljovi\e6";')
def test_nameid_unsupported_platform(self):
self.assertRaisesRegex(