From 7e4092aba80b0daca5db87ab099fb63578a9144f Mon Sep 17 00:00:00 2001 From: Behdad Esfahbod Date: Wed, 22 Apr 2015 01:49:15 -0700 Subject: [PATCH] Implement heuristics for decoding odd-length UTF-16BE data in name table Part of https://github.com/behdad/fonttools/issues/249 --- Lib/fontTools/ttLib/tables/_n_a_m_e.py | 33 ++++++++++++++++++--- Lib/fontTools/ttLib/tables/_n_a_m_e_test.py | 17 +++++++---- 2 files changed, 41 insertions(+), 9 deletions(-) diff --git a/Lib/fontTools/ttLib/tables/_n_a_m_e.py b/Lib/fontTools/ttLib/tables/_n_a_m_e.py index 6856bb9a1..257e83b47 100644 --- a/Lib/fontTools/ttLib/tables/_n_a_m_e.py +++ b/Lib/fontTools/ttLib/tables/_n_a_m_e.py @@ -121,11 +121,36 @@ class NameRecord(object): entry as returned by self.getEncoding(); Note that self.getEncoding() returns 'ascii' if the encoding is unknown to the library. - If the bytes are ill-formed in that chosen encoding, the error is handled - according to the errors parameter to this function, which is passed to the - underlying decode() function; by default it throws a UnicodeDecodeError exception. + Certain heuristics are performed to recover data from bytes that are + ill-formed in the chosen encoding, or that otherwise look misencoded + (mostly around bad UTF-16BE encoded bytes, or bytes that look like UTF-16BE + but are marked otherwise). If the bytes are ill-formed and the heuristics fail, + the error is handled according to the errors parameter to this function, which is + passed to the underlying decode() function; by default it throws a + UnicodeDecodeError exception. + + Note: The mentioned heuristics mean that roundtripping a font to XML and back + to binary might recover some misencoded data whereas just loading the font + and saving it back will not change it. 
""" - return tounicode(self.string, encoding=self.getEncoding(), errors=errors) + encoding = self.getEncoding() + string = self.string + if encoding == 'utf_16be' and len(string) % 2 == 1: + # Recover badly encoded UTF-16 strings that have an odd number of bytes: + # - If the last byte is zero, drop it. Otherwise, + # - If all the odd bytes are zero and all the even bytes are ASCII, + # prepend one zero byte. Otherwise, + # - If first byte is zero and all other bytes are ASCII, insert zero + # bytes between consecutive ASCII bytes. + # + # (Yes, I've seen all of these in the wild... sigh) + if byteord(string[-1]) == 0: + string = string[:-1] + elif all(byteord(b) == 0 if i % 2 else byteord(b) >= 0x20 for i,b in enumerate(string)): + string = b'\0' + string + elif byteord(string[0]) == 0 and all(byteord(b) >= 0x20 for b in string[1:]): + string = bytesjoin(b'\0'+bytechr(byteord(b)) for b in string[1:]) + return tounicode(string, encoding=encoding, errors=errors) def toBytes(self, errors='strict'): """ If self.string is a bytes object, return it; otherwise try encoding diff --git a/Lib/fontTools/ttLib/tables/_n_a_m_e_test.py b/Lib/fontTools/ttLib/tables/_n_a_m_e_test.py index 93772dae9..c8b830d6f 100644 --- a/Lib/fontTools/ttLib/tables/_n_a_m_e_test.py +++ b/Lib/fontTools/ttLib/tables/_n_a_m_e_test.py @@ -28,9 +28,8 @@ class NameRecordTest(unittest.TestCase): self.assertEqual("Foo Italic"+unichr(0x02DA), name.toUnicode()) def test_toUnicode_UnicodeDecodeError(self): - name = self.makeName("Foo Bold", 111, 0, 2, 7) + name = self.makeName(b"\1", 111, 0, 2, 7) self.assertEqual("utf_16be", name.getEncoding()) - name.string = b"X" # invalid utf_16be sequence self.assertRaises(UnicodeDecodeError, name.toUnicode) def toXML(self, name): @@ -47,11 +46,19 @@ class NameRecordTest(unittest.TestCase): '' ], self.toXML(name)) - def test_toXML_utf16be_broken(self): + def test_toXML_utf16be_odd_length1(self): name = self.makeName(b"\0F\0o\0o\0", 111, 0, 2, 7) self.assertEqual([ - 
'', - ' &#0;F&#0;o&#0;o&#0;', + '', + ' Foo', + '' + ], self.toXML(name)) + + def test_toXML_utf16be_odd_length2(self): + name = self.makeName(b"\0Fooz", 111, 0, 2, 7) + self.assertEqual([ + '', + ' Fooz', '' ], self.toXML(name))