Implement heuristics for decoding odd-length UTF-16BE data in name table

Part of https://github.com/behdad/fonttools/issues/249
This commit is contained in:
Behdad Esfahbod 2015-04-22 01:49:15 -07:00
parent abf7dc6568
commit 7e4092aba8
2 changed files with 41 additions and 9 deletions

View File

@ -121,11 +121,36 @@ class NameRecord(object):
entry as returned by self.getEncoding(); Note that self.getEncoding() entry as returned by self.getEncoding(); Note that self.getEncoding()
returns 'ascii' if the encoding is unknown to the library. returns 'ascii' if the encoding is unknown to the library.
If the bytes are ill-formed in that chosen encoding, the error is handled Certain heuristics are performed to recover data from bytes that are
according to the errors parameter to this function, which is passed to the ill-formed in the chosen encoding, or that otherwise look misencoded
underlying decode() function; by default it throws a UnicodeDecodeError exception. (mostly around bad UTF-16BE encoded bytes, or bytes that look like UTF-16BE
but are marked otherwise). If the bytes are ill-formed and the heuristics fail,
the error is handled according to the errors parameter to this function, which is
passed to the underlying decode() function; by default it throws a
UnicodeDecodeError exception.
Note: Because of these heuristics, round-tripping a font to XML and back
to binary might recover some misencoded data, whereas just loading the font
and saving it back will leave that data unchanged.
""" """
return tounicode(self.string, encoding=self.getEncoding(), errors=errors) encoding = self.getEncoding()
string = self.string
if encoding == 'utf_16be' and len(string) % 2 == 1:
# Recover badly encoded UTF-16 strings that have an odd number of bytes:
# - If the last byte is zero, drop it. Otherwise,
# - If all bytes at odd offsets (counting from 0) are zero and all bytes at
# even offsets are >= 0x20 (i.e. look like printable ASCII), prepend one
# zero byte. Otherwise,
# - If the first byte is zero and all other bytes are >= 0x20, insert zero
# bytes between consecutive bytes.
#
# (Yes, I've seen all of these in the wild... sigh)
if byteord(string[-1]) == 0:
string = string[:-1]
elif all(byteord(b) == 0 if i % 2 else byteord(b) >= 0x20 for i,b in enumerate(string)):
string = b'\0' + string
elif byteord(string[0]) == 0 and all(byteord(b) >= 0x20 for b in string[1:]):
string = bytesjoin(b'\0'+bytechr(byteord(b)) for b in string[1:])
return tounicode(string, encoding=encoding, errors=errors)
def toBytes(self, errors='strict'): def toBytes(self, errors='strict'):
""" If self.string is a bytes object, return it; otherwise try encoding """ If self.string is a bytes object, return it; otherwise try encoding

View File

@ -28,9 +28,8 @@ class NameRecordTest(unittest.TestCase):
self.assertEqual("Foo Italic"+unichr(0x02DA), name.toUnicode()) self.assertEqual("Foo Italic"+unichr(0x02DA), name.toUnicode())
def test_toUnicode_UnicodeDecodeError(self): def test_toUnicode_UnicodeDecodeError(self):
name = self.makeName("Foo Bold", 111, 0, 2, 7) name = self.makeName(b"\1", 111, 0, 2, 7)
self.assertEqual("utf_16be", name.getEncoding()) self.assertEqual("utf_16be", name.getEncoding())
name.string = b"X" # invalid utf_16be sequence
self.assertRaises(UnicodeDecodeError, name.toUnicode) self.assertRaises(UnicodeDecodeError, name.toUnicode)
def toXML(self, name): def toXML(self, name):
@ -47,11 +46,19 @@ class NameRecordTest(unittest.TestCase):
'</namerecord>' '</namerecord>'
], self.toXML(name)) ], self.toXML(name))
def test_toXML_utf16be_broken(self): def test_toXML_utf16be_odd_length1(self):
name = self.makeName(b"\0F\0o\0o\0", 111, 0, 2, 7) name = self.makeName(b"\0F\0o\0o\0", 111, 0, 2, 7)
self.assertEqual([ self.assertEqual([
'<namerecord nameID="111" platformID="0" platEncID="2" langID="0x7" unicode="False">', '<namerecord nameID="111" platformID="0" platEncID="2" langID="0x7">',
' &#0;F&#0;o&#0;o&#0;', ' Foo',
'</namerecord>'
], self.toXML(name))
def test_toXML_utf16be_odd_length2(self):
name = self.makeName(b"\0Fooz", 111, 0, 2, 7)
self.assertEqual([
'<namerecord nameID="111" platformID="0" platEncID="2" langID="0x7">',
' Fooz',
'</namerecord>' '</namerecord>'
], self.toXML(name)) ], self.toXML(name))