Implement heuristics for decoding odd-length UTF-16BE data in name table

Part of https://github.com/behdad/fonttools/issues/249
This commit is contained in:
Behdad Esfahbod 2015-04-22 01:49:15 -07:00
parent abf7dc6568
commit 7e4092aba8
2 changed files with 41 additions and 9 deletions

View File

@ -121,11 +121,36 @@ class NameRecord(object):
entry as returned by self.getEncoding(); Note that self.getEncoding()
returns 'ascii' if the encoding is unknown to the library.
If the bytes are ill-formed in that chosen encoding, the error is handled
according to the errors parameter to this function, which is passed to the
underlying decode() function; by default it throws a UnicodeDecodeError exception.
Certain heuristics are performed to recover data from bytes that are
ill-formed in the chosen encoding, or that otherwise look misencoded
(mostly around badly encoded UTF-16BE bytes, or bytes that look like UTF-16BE
but are marked otherwise). If the bytes are ill-formed and the heuristics fail,
the error is handled according to the errors parameter to this function, which is
passed to the underlying decode() function; by default it raises a
UnicodeDecodeError exception.
Note: The mentioned heuristics mean that roundtripping a font to XML and back
to binary might recover some misencoded data whereas just loading the font
and saving it back will not change them.
"""
return tounicode(self.string, encoding=self.getEncoding(), errors=errors)
encoding = self.getEncoding()
string = self.string
if encoding == 'utf_16be' and len(string) % 2 == 1:
# Recover badly encoded UTF-16 strings that have an odd number of bytes:
# - If the last byte is zero, drop it. Otherwise,
# - If all the odd-indexed bytes (counting from 0) are zero and all the
#   even-indexed bytes look like ASCII text (the check is only >= 0x20,
#   with no upper bound), prepend one zero byte. Otherwise,
# - If the first byte is zero and all other bytes look like ASCII text
#   (again, only >= 0x20 is checked), insert zero bytes between
#   consecutive ASCII bytes.
#
# (Yes, I've seen all of these in the wild... sigh)
if byteord(string[-1]) == 0:
string = string[:-1]
elif all(byteord(b) == 0 if i % 2 else byteord(b) >= 0x20 for i,b in enumerate(string)):
string = b'\0' + string
elif byteord(string[0]) == 0 and all(byteord(b) >= 0x20 for b in string[1:]):
string = bytesjoin(b'\0'+bytechr(byteord(b)) for b in string[1:])
return tounicode(string, encoding=encoding, errors=errors)
def toBytes(self, errors='strict'):
""" If self.string is a bytes object, return it; otherwise try encoding

View File

@ -28,9 +28,8 @@ class NameRecordTest(unittest.TestCase):
self.assertEqual("Foo Italic"+unichr(0x02DA), name.toUnicode())
def test_toUnicode_UnicodeDecodeError(self):
# Checks that toUnicode() raises UnicodeDecodeError for a name record whose
# encoding resolves to utf_16be but whose bytes cannot be decoded.
# NOTE(review): this span appears to interleave the removed and the added
# lines of a diff hunk (markers stripped): the "Foo Bold" and b"X" lines look
# like the old version and the b"\1" line like the replacement -- confirm
# against the actual test file.
name = self.makeName("Foo Bold", 111, 0, 2, 7)
# A single byte below 0x20 has odd length but matches none of the odd-length
# recovery heuristics in toUnicode() (it is not zero and not >= 0x20), so the
# bytes reach the decoder unchanged and are expected to fail.
name = self.makeName(b"\1", 111, 0, 2, 7)
self.assertEqual("utf_16be", name.getEncoding())
# NOTE(review): b"X" is a single byte >= 0x20, so the "prepend one zero byte"
# heuristic would turn it into b"\0X", which presumably decodes without
# error -- likely the reason this override no longer triggers the exception.
name.string = b"X" # invalid utf_16be sequence
self.assertRaises(UnicodeDecodeError, name.toUnicode)
def toXML(self, name):
@ -47,11 +46,19 @@ class NameRecordTest(unittest.TestCase):
'</namerecord>'
], self.toXML(name))
def test_toXML_utf16be_broken(self):
def test_toXML_utf16be_odd_length1(self):
# NOTE(review): the two def lines above and the two groups of expected XML
# below look like the removed side (test_toXML_utf16be_broken, with
# unicode="False" and raw &#0; escapes) and the added side
# (test_toXML_utf16be_odd_length1, with plain 'Foo') of a diff hunk whose +/-
# markers were stripped -- confirm against the actual test file.
#
# The input is seven bytes: UTF-16BE-style "Foo" followed by one stray zero
# byte. The odd-length heuristic in toUnicode() drops a trailing zero byte,
# so the record is expected to serialize as readable text.
name = self.makeName(b"\0F\0o\0o\0", 111, 0, 2, 7)
self.assertEqual([
# Presumably the old expectation: undecodable data dumped byte-by-byte.
'<namerecord nameID="111" platformID="0" platEncID="2" langID="0x7" unicode="False">',
' &#0;F&#0;o&#0;o&#0;',
# Presumably the new expectation: the recovered string "Foo".
'<namerecord nameID="111" platformID="0" platEncID="2" langID="0x7">',
' Foo',
'</namerecord>'
], self.toXML(name))
def test_toXML_utf16be_odd_length2(self):
    """Odd-length utf_16be data with a leading zero byte serializes as text.

    The input is one zero byte followed by four single-byte characters
    (five bytes in total); the record is expected to be written to XML as
    the readable string "Fooz".
    """
    record = self.makeName(b"\0Fooz", 111, 0, 2, 7)
    expected_lines = [
        '<namerecord nameID="111" platformID="0" platEncID="2" langID="0x7">',
        ' Fooz',
        '</namerecord>',
    ]
    self.assertEqual(expected_lines, self.toXML(record))