Revamp name table Unicode handling some more

Part of https://github.com/behdad/fonttools/issues/236

Now we fall back to ASCII for unknown encodings.  Not sure whether this is a bad idea.
The main user-visible difference is that if there's an ASCII-only text in an unknown
encoding, we still "decode" it and use unicode="True" instead of unicode="False".

Or is assuming that any unsupported encoding is ASCII-compatible too intrusive?
This commit is contained in:
Behdad Esfahbod 2015-04-16 17:09:49 -07:00
parent f419f572db
commit ba0a3b9abb
3 changed files with 57 additions and 30 deletions

View File

@ -31,14 +31,14 @@ except ImportError:
def strjoin(iterable, joiner=''):
return tostr(joiner).join(iterable)
def tobytes(s, encoding='ascii'):
def tobytes(s, encoding='ascii', errors='strict'):
if not isinstance(s, bytes):
return s.encode(encoding)
return s.encode(encoding, errors)
else:
return s
def tounicode(s, encoding='ascii'):
def tounicode(s, encoding='ascii', errors='strict'):
if not isinstance(s, unicode):
return s.decode(encoding)
return s.decode(encoding, errors)
else:
return s

View File

@ -150,37 +150,61 @@ class NameRecord(object):
},
}
def getEncoding(self):
encoding = self._encodingMap.get(self.platformID, {}).get(self.platEncID, None)
def getEncoding(self, default='ascii'):
"""Returns the Python encoding name for this name entry based on its platformID,
platEncID, and langID. If encoding for these values is not known, by default
'ascii' is returned. That can be overridden by passing a value to the default
argument.
"""
encoding = self._encodingMap.get(self.platformID, {}).get(self.platEncID, default)
if isinstance(encoding, dict):
encoding = encoding.get(self.langID, encoding[Ellipsis])
return encoding
def encodingIsUnicodeCompatible(self):
return self.getEncoding() in ['utf-16be', 'ucs2be', 'ascii', 'latin1']
return self.getEncoding(None) in ['utf-16be', 'ucs2be', 'ascii', 'latin1']
def __str__(self):
unistr = self.toUnicode()
if unistr != None:
return unistr
else:
try:
return self.toUnicode()
except UnicodeDecodeError:
return str(self.string)
def isUnicode(self):
return (self.platformID == 0 or
(self.platformID == 3 and self.platEncID in [0, 1, 10]))
def toUnicode(self):
encoding = self.getEncoding()
if encoding == None:
return None
return tounicode(self.string, encoding=encoding)
def toUnicode(self, errors='strict'):
"""
If self.string is a Unicode string, return it; otherwise try decoding the
bytes in self.string to a Unicode string using the encoding of this
entry as returned by self.getEncoding(); Note that self.getEncoding()
returns 'ascii' if the encoding is unknown to the library.
def toBytes(self):
return tobytes(self.string, encoding=self.getEncoding())
If the bytes are ill-formed in that chosen encoding, the error is handled
according to the errors parameter to this function, which is passed to the
underlying decode() function; by default it throws a UnicodeDecodeError exception.
"""
return tounicode(self.string, encoding=self.getEncoding(), errors=errors)
def toBytes(self, errors='strict'):
""" If self.string is a bytes object, return it; otherwise try encoding
the Unicode string in self.string to bytes using the encoding of this
entry as returned by self.getEncoding(); Note that self.getEncoding()
returns 'ascii' if the encoding is unknown to the library.
If the Unicode string cannot be encoded to bytes in the chosen encoding,
the error is handled according to the errors parameter to this function,
which is passed to the underlying encode() function; by default it throws a
UnicodeEncodeError exception.
"""
return tobytes(self.string, encoding=self.getEncoding(), errors=errors)
def toXML(self, writer, ttFont):
unistr = self.toUnicode()
try:
unistr = self.toUnicode()
except UnicodeDecodeError:
unistr = None
attrs = [
("nameID", self.nameID),
("platformID", self.platformID),

View File

@ -9,7 +9,7 @@ class NameRecordTest(unittest.TestCase):
def makeName(self, text, nameID, platformID, platEncID, langID):
name = NameRecord()
name.nameID, name.platformID, name.platEncID, name.langID = (nameID, platformID, platEncID, langID)
name.string = text.encode(name.getEncoding())
name.string = tobytes(text, encoding=name.getEncoding())
return name
def test_toUnicode_utf16be(self):
@ -19,7 +19,7 @@ class NameRecordTest(unittest.TestCase):
def test_toUnicode_macroman(self):
name = self.makeName("Foo Italic", 222, 1, 0, 7) # MacRoman
self.assertEqual("macroman", name.getEncoding())
self.assertEqual("mac-roman", name.getEncoding())
self.assertEqual("Foo Italic", name.toUnicode())
def test_toUnicode_UnicodeDecodeError(self):
@ -50,19 +50,24 @@ class NameRecordTest(unittest.TestCase):
'</namerecord>'
], self.toXML(name))
def test_toXML_unknownPlatEncID(self):
name = NameRecord()
name.string = b"B\x8arli"
name.nameID, name.platformID, name.platEncID, name.langID = (333, 1, 9876, 7)
def test_toXML_unknownPlatEncID_nonASCII(self):
name = self.makeName(b"B\x8arli", 333, 1, 9876, 7) # Unknown Mac encodingID
self.assertEqual([
'<namerecord nameID="333" platformID="1" platEncID="9876" langID="0x7" unicode="False">',
' B&#138;rli',
'</namerecord>'
], self.toXML(name))
def test_toXML_unknownPlatEncID_ASCII(self):
name = self.makeName(b"Barli", 333, 1, 9876, 7) # Unknown Mac encodingID
self.assertEqual([
'<namerecord nameID="333" platformID="1" platEncID="9876" langID="0x7" unicode="True">',
' Barli',
'</namerecord>'
], self.toXML(name))
def test_encoding_macroman_misc(self):
name = NameRecord()
name.nameID, name.platformID, name.platEncID, name.langID = (123, 1, 0, 17)
name = self.makeName('', 123, 1, 0, 17) # Mac Turkish
self.assertEqual(name.getEncoding(), "mac-turkish")
name.langID = 37
self.assertEqual(name.getEncoding(), None)
@ -70,9 +75,7 @@ class NameRecordTest(unittest.TestCase):
self.assertEqual(name.getEncoding(), "mac-roman")
def test_extended_mac_encodings(self):
name = NameRecord()
name.string = b"\xfe"
name.nameID, name.platformID, name.platEncID, name.langID = (123, 1, 1, 0)
name = self.makeName(b'\xfe', 123, 1, 1, 0) # Mac Japanese
self.assertEqual(name.toUnicode(), unichr(0x2122))
if __name__ == "__main__":