Implement heuristics for decoding odd-length UTF-16BE data in name table
Part of https://github.com/behdad/fonttools/issues/249
This commit is contained in:
parent
abf7dc6568
commit
7e4092aba8
@ -121,11 +121,36 @@ class NameRecord(object):
|
|||||||
entry as returned by self.getEncoding(); Note that self.getEncoding()
|
entry as returned by self.getEncoding(); Note that self.getEncoding()
|
||||||
returns 'ascii' if the encoding is unknown to the library.
|
returns 'ascii' if the encoding is unknown to the library.
|
||||||
|
|
||||||
If the bytes are ill-formed in that chosen encoding, the error is handled
|
Certain heuristics are performed to recover data from bytes that are
|
||||||
according to the errors parameter to this function, which is passed to the
|
ill-formed in the chosen encoding, or that otherwise look misencoded
|
||||||
underlying decode() function; by default it throws a UnicodeDecodeError exception.
|
(mostly around bad UTF-16BE encoded bytes, or bytes that look like UTF-16BE
|
||||||
|
but marked otherwise). If the bytes are ill-formed and the heuristics fail,
|
||||||
|
the error is handled according to the errors parameter to this function, which is
|
||||||
|
passed to the underlying decode() function; by default it throws a
|
||||||
|
UnicodeDecodeError exception.
|
||||||
|
|
||||||
|
Note: The mentioned heuristics mean that roundtripping a font to XML and back
|
||||||
|
to binary might recover some misencoded data whereas just loading the font
|
||||||
|
and saving it back will not change them.
|
||||||
"""
|
"""
|
||||||
return tounicode(self.string, encoding=self.getEncoding(), errors=errors)
|
encoding = self.getEncoding()
|
||||||
|
string = self.string
|
||||||
|
if encoding == 'utf_16be' and len(string) % 2 == 1:
|
||||||
|
# Recover badly encoded UTF-16 strings that have an odd number of bytes:
|
||||||
|
# - If the last byte is zero, drop it. Otherwise,
|
||||||
|
# - If all the odd bytes are zero and all the even bytes are ASCII,
|
||||||
|
# prepend one zero byte. Otherwise,
|
||||||
|
# - If first byte is zero and all other bytes are ASCII, insert zero
|
||||||
|
# bytes between consecutive ASCII bytes.
|
||||||
|
#
|
||||||
|
# (Yes, I've seen all of these in the wild... sigh)
|
||||||
|
if byteord(string[-1]) == 0:
|
||||||
|
string = string[:-1]
|
||||||
|
elif all(byteord(b) == 0 if i % 2 else byteord(b) >= 0x20 for i,b in enumerate(string)):
|
||||||
|
string = b'\0' + string
|
||||||
|
elif byteord(string[0]) == 0 and all(byteord(b) >= 0x20 for b in string[1:]):
|
||||||
|
string = bytesjoin(b'\0'+bytechr(byteord(b)) for b in string[1:])
|
||||||
|
return tounicode(string, encoding=encoding, errors=errors)
|
||||||
|
|
||||||
def toBytes(self, errors='strict'):
|
def toBytes(self, errors='strict'):
|
||||||
""" If self.string is a bytes object, return it; otherwise try encoding
|
""" If self.string is a bytes object, return it; otherwise try encoding
|
||||||
|
@ -28,9 +28,8 @@ class NameRecordTest(unittest.TestCase):
|
|||||||
self.assertEqual("Foo Italic"+unichr(0x02DA), name.toUnicode())
|
self.assertEqual("Foo Italic"+unichr(0x02DA), name.toUnicode())
|
||||||
|
|
||||||
def test_toUnicode_UnicodeDecodeError(self):
|
def test_toUnicode_UnicodeDecodeError(self):
|
||||||
name = self.makeName("Foo Bold", 111, 0, 2, 7)
|
name = self.makeName(b"\1", 111, 0, 2, 7)
|
||||||
self.assertEqual("utf_16be", name.getEncoding())
|
self.assertEqual("utf_16be", name.getEncoding())
|
||||||
name.string = b"X" # invalid utf_16be sequence
|
|
||||||
self.assertRaises(UnicodeDecodeError, name.toUnicode)
|
self.assertRaises(UnicodeDecodeError, name.toUnicode)
|
||||||
|
|
||||||
def toXML(self, name):
|
def toXML(self, name):
|
||||||
@ -47,11 +46,19 @@ class NameRecordTest(unittest.TestCase):
|
|||||||
'</namerecord>'
|
'</namerecord>'
|
||||||
], self.toXML(name))
|
], self.toXML(name))
|
||||||
|
|
||||||
def test_toXML_utf16be_broken(self):
|
def test_toXML_utf16be_odd_length1(self):
|
||||||
name = self.makeName(b"\0F\0o\0o\0", 111, 0, 2, 7)
|
name = self.makeName(b"\0F\0o\0o\0", 111, 0, 2, 7)
|
||||||
self.assertEqual([
|
self.assertEqual([
|
||||||
'<namerecord nameID="111" platformID="0" platEncID="2" langID="0x7" unicode="False">',
|
'<namerecord nameID="111" platformID="0" platEncID="2" langID="0x7">',
|
||||||
' &#0;F&#0;o&#0;o&#0;',
|
' Foo',
|
||||||
|
'</namerecord>'
|
||||||
|
], self.toXML(name))
|
||||||
|
|
||||||
|
def test_toXML_utf16be_odd_length2(self):
|
||||||
|
name = self.makeName(b"\0Fooz", 111, 0, 2, 7)
|
||||||
|
self.assertEqual([
|
||||||
|
'<namerecord nameID="111" platformID="0" platEncID="2" langID="0x7">',
|
||||||
|
' Fooz',
|
||||||
'</namerecord>'
|
'</namerecord>'
|
||||||
], self.toXML(name))
|
], self.toXML(name))
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user