Revamp name table Unicode handling some more

Part of https://github.com/behdad/fonttools/issues/236 Now we fall back to ASCII for unknown encodings. Not sure if this might be a bad idea. The main user-visible difference is that if there's ASCII-only text in an unknown encoding, we still "decode" it and use unicode="True" instead of unicode="False". Or is assuming that any unsupported encoding is ASCII-compatible too intrusive?
2015-04-16 17:09:49 -07:00 · 2015-04-16 17:09:49 -07:00 · ba0a3b9abb
commit ba0a3b9abb
parent f419f572db
3 changed files with 57 additions and 30 deletions
--- a/Lib/fontTools/misc/py23.py
+++ b/Lib/fontTools/misc/py23.py
@ -31,14 +31,14 @@ except ImportError:
 def strjoin(iterable, joiner=''):
 	return tostr(joiner).join(iterable)

-def tobytes(s, encoding='ascii'):
+def tobytes(s, encoding='ascii', errors='strict'):
 	if not isinstance(s, bytes):
-		return s.encode(encoding)
+		return s.encode(encoding, errors)
 	else:
 		return s
-def tounicode(s, encoding='ascii'):
+def tounicode(s, encoding='ascii', errors='strict'):
 	if not isinstance(s, unicode):
-		return s.decode(encoding)
+		return s.decode(encoding, errors)
 	else:
 		return s

--- a/Lib/fontTools/ttLib/tables/_n_a_m_e.py
+++ b/Lib/fontTools/ttLib/tables/_n_a_m_e.py
@ -150,37 +150,61 @@ class NameRecord(object):
 		},
 	}

-	def getEncoding(self):
-		encoding = self._encodingMap.get(self.platformID, {}).get(self.platEncID, None)
+	def getEncoding(self, default='ascii'):
+		"""Returns the Python encoding name for this name entry based on its platformID,
+		platEncID, and langID.  If encoding for these values is not known, by default
+		'ascii' is returned.  That can be overridden by passing a value to the default
+		argument.
+		"""
+		encoding = self._encodingMap.get(self.platformID, {}).get(self.platEncID, default)
 		if isinstance(encoding, dict):
 			encoding = encoding.get(self.langID, encoding[Ellipsis])
 		return encoding

 	def encodingIsUnicodeCompatible(self):
-		return self.getEncoding() in ['utf-16be', 'ucs2be', 'ascii', 'latin1']
+		return self.getEncoding(None) in ['utf-16be', 'ucs2be', 'ascii', 'latin1']

 	def __str__(self):
-		unistr = self.toUnicode()
-		if unistr != None:
-			return unistr
-		else:
+		try:
+			return self.toUnicode()
+		except UnicodeDecodeError:
 			return str(self.string)

 	def isUnicode(self):
 		return (self.platformID == 0 or
 			(self.platformID == 3 and self.platEncID in [0, 1, 10]))

-	def toUnicode(self):
-		encoding = self.getEncoding()
-		if encoding == None:
-			return None
-		return tounicode(self.string, encoding=encoding)
+	def toUnicode(self, errors='strict'):
+		"""
+		If self.string is a Unicode string, return it; otherwise try decoding the
+		bytes in self.string to a Unicode string using the encoding of this
+		entry as returned by self.getEncoding(); Note that  self.getEncoding()
+		returns 'ascii' if the encoding is unknown to the library.

-	def toBytes(self):
-		return tobytes(self.string, encoding=self.getEncoding())
+		If the bytes are ill-formed in that chosen encoding, the error is handled
+		according to the errors parameter to this function, which is passed to the
+		underlying decode() function; by default it throws a UnicodeDecodeError exception.
+		"""
+		return tounicode(self.string, encoding=self.getEncoding(), errors=errors)
+
+	def toBytes(self, errors='strict'):
+		""" If self.string is a bytes object, return it; otherwise try encoding
+		the Unicode string in self.string to bytes using the encoding of this
+		entry as returned by self.getEncoding(); Note that self.getEncoding()
+		returns 'ascii' if the encoding is unknown to the library.
+
+		If the Unicode string cannot be encoded to bytes in the chosen encoding,
+		the error is handled according to the errors parameter to this function,
+		which is passed to the underlying encode() function; by default it throws a
+		UnicodeEncodeError exception.
+		"""
+		return tobytes(self.string, encoding=self.getEncoding(), errors=errors)

 	def toXML(self, writer, ttFont):
-		unistr = self.toUnicode()
+		try:
+			unistr = self.toUnicode()
+		except UnicodeDecodeError:
+			unistr = None
 		attrs = [
 				("nameID", self.nameID),
 				("platformID", self.platformID),
--- a/Lib/fontTools/ttLib/tables/_n_a_m_e_test.py
+++ b/Lib/fontTools/ttLib/tables/_n_a_m_e_test.py
@ -9,7 +9,7 @@ class NameRecordTest(unittest.TestCase):
 	def makeName(self, text, nameID, platformID, platEncID, langID):
 		name = NameRecord()
 		name.nameID, name.platformID, name.platEncID, name.langID = (nameID, platformID, platEncID, langID)
-		name.string = text.encode(name.getEncoding())
+		name.string = tobytes(text, encoding=name.getEncoding())
 		return name

 	def test_toUnicode_utf16be(self):
@ -19,7 +19,7 @@ class NameRecordTest(unittest.TestCase):

 	def test_toUnicode_macroman(self):
 		name = self.makeName("Foo Italic", 222, 1, 0, 7)  # MacRoman
-		self.assertEqual("macroman", name.getEncoding())
+		self.assertEqual("mac-roman", name.getEncoding())
 		self.assertEqual("Foo Italic", name.toUnicode())

 	def test_toUnicode_UnicodeDecodeError(self):
@ -50,19 +50,24 @@ class NameRecordTest(unittest.TestCase):
                    '</namerecord>'
 		], self.toXML(name))

-	def test_toXML_unknownPlatEncID(self):
-		name = NameRecord()
-		name.string = b"B\x8arli"
-		name.nameID, name.platformID, name.platEncID, name.langID = (333, 1, 9876, 7)
+	def test_toXML_unknownPlatEncID_nonASCII(self):
+		name = self.makeName(b"B\x8arli", 333, 1, 9876, 7) # Unknown Mac encodingID
 		self.assertEqual([
                    '<namerecord nameID="333" platformID="1" platEncID="9876" langID="0x7" unicode="False">',
                    '  B&#138;rli',
                    '</namerecord>'
 		], self.toXML(name))

+	def test_toXML_unknownPlatEncID_ASCII(self):
+		name = self.makeName(b"Barli", 333, 1, 9876, 7) # Unknown Mac encodingID
+		self.assertEqual([
+                    '<namerecord nameID="333" platformID="1" platEncID="9876" langID="0x7" unicode="True">',
+                    '  Barli',
+                    '</namerecord>'
+		], self.toXML(name))
+
 	def test_encoding_macroman_misc(self):
-		name = NameRecord()
-		name.nameID, name.platformID, name.platEncID, name.langID = (123, 1, 0, 17)
+		name = self.makeName('', 123, 1, 0, 17) # Mac Turkish
 		self.assertEqual(name.getEncoding(), "mac-turkish")
 		name.langID = 37
 		self.assertEqual(name.getEncoding(), None)
@ -70,9 +75,7 @@ class NameRecordTest(unittest.TestCase):
 		self.assertEqual(name.getEncoding(), "mac-roman")

 	def test_extended_mac_encodings(self):
-		name = NameRecord()
-		name.string = b"\xfe"
-		name.nameID, name.platformID, name.platEncID, name.langID = (123, 1, 1, 0)
+		name = self.makeName(b'\xfe', 123, 1, 1, 0) # Mac Japanese
 		self.assertEqual(name.toUnicode(), unichr(0x2122))

 if __name__ == "__main__":