Implement heuristics for decoding odd-length UTF-16BE data in name table

Part of https://github.com/behdad/fonttools/issues/249
2015-04-22 01:49:15 -07:00 · 2015-04-22 01:49:15 -07:00 · 7e4092aba8
commit 7e4092aba8
parent abf7dc6568
2 changed files with 41 additions and 9 deletions
--- a/Lib/fontTools/ttLib/tables/_n_a_m_e.py
+++ b/Lib/fontTools/ttLib/tables/_n_a_m_e.py
@@ -121,11 +121,36 @@ class NameRecord(object):
 		entry as returned by self.getEncoding(); Note that  self.getEncoding()
 		returns 'ascii' if the encoding is unknown to the library.

-		If the bytes are ill-formed in that chosen encoding, the error is handled
-		according to the errors parameter to this function, which is passed to the
-		underlying decode() function; by default it throws a UnicodeDecodeError exception.
+		Certain heuristics are performed to recover data from bytes that are
+		ill-formed in the chosen encoding, or that otherwise look misencoded
+		(mostly around bad UTF-16BE encoded bytes, or bytes that look like UTF-16BE
+		but are marked otherwise).  If the bytes are ill-formed and the heuristics fail,
+		the error is handled according to the errors parameter to this function, which is
+		passed to the underlying decode() function; by default it throws a
+		UnicodeDecodeError exception.
+
+		Note: The mentioned heuristics mean that roundtripping a font to XML and back
+		to binary might recover some misencoded data, whereas just loading the font
+		and saving it back will not change it.
 		"""
-		return tounicode(self.string, encoding=self.getEncoding(), errors=errors)
+		encoding = self.getEncoding()
+		string = self.string
+		if encoding == 'utf_16be' and len(string) % 2 == 1:
+			# Recover badly encoded UTF-16 strings that have an odd number of bytes:
+			# - If the last byte is zero, drop it.  Otherwise,
+			# - If all the odd-indexed bytes (counting from zero) are zero and all the
+			#   even-indexed bytes look like ASCII text (>= 0x20), prepend one zero byte.  Otherwise,
+			# - If the first byte is zero and all other bytes look like ASCII text
+			#   (>= 0x20), insert a zero byte before each of those other bytes.
+			#
+			# (Yes, I've seen all of these in the wild... sigh)
+			if byteord(string[-1]) == 0:
+				string = string[:-1]
+			elif all(byteord(b) == 0 if i % 2 else byteord(b) >= 0x20 for i,b in enumerate(string)):
+				string = b'\0' + string
+			elif byteord(string[0]) == 0 and all(byteord(b) >= 0x20 for b in string[1:]):
+				string = bytesjoin(b'\0'+bytechr(byteord(b)) for b in string[1:])
+		return tounicode(string, encoding=encoding, errors=errors)

 	def toBytes(self, errors='strict'):
 		""" If self.string is a bytes object, return it; otherwise try encoding
--- a/Lib/fontTools/ttLib/tables/_n_a_m_e_test.py
+++ b/Lib/fontTools/ttLib/tables/_n_a_m_e_test.py
@@ -28,9 +28,8 @@ class NameRecordTest(unittest.TestCase):
 		self.assertEqual("Foo Italic"+unichr(0x02DA), name.toUnicode())

 	def test_toUnicode_UnicodeDecodeError(self):
-		name = self.makeName("Foo Bold", 111, 0, 2, 7)
+		name = self.makeName(b"\1", 111, 0, 2, 7)
 		self.assertEqual("utf_16be", name.getEncoding())
-		name.string = b"X"  # invalid utf_16be sequence
 		self.assertRaises(UnicodeDecodeError, name.toUnicode)

 	def toXML(self, name):
@@ -47,11 +46,19 @@ class NameRecordTest(unittest.TestCase):
                    '</namerecord>'
 		], self.toXML(name))

-	def test_toXML_utf16be_broken(self):
+	def test_toXML_utf16be_odd_length1(self):
 		name = self.makeName(b"\0F\0o\0o\0", 111, 0, 2, 7)
 		self.assertEqual([
-                    '<namerecord nameID="111" platformID="0" platEncID="2" langID="0x7" unicode="False">',
-                    '  &#0;F&#0;o&#0;o&#0;',
+                    '<namerecord nameID="111" platformID="0" platEncID="2" langID="0x7">',
+                    '  Foo',
+                    '</namerecord>'
+		], self.toXML(name))
+
+	def test_toXML_utf16be_odd_length2(self):
+		name = self.makeName(b"\0Fooz", 111, 0, 2, 7)
+		self.assertEqual([
+                    '<namerecord nameID="111" platformID="0" platEncID="2" langID="0x7">',
+                    '  Fooz',
                    '</namerecord>'
 		], self.toXML(name))