Revamp name table Unicode handling some more
Part of https://github.com/behdad/fonttools/issues/236. We now fall back to ASCII for unknown encodings. I am not sure whether this is a bad idea. The main user-visible difference is that if a name entry in an unknown encoding contains only ASCII text, we still "decode" it and write unicode="True" instead of unicode="False". Or is it too intrusive to assume that any unsupported encoding is ASCII-compatible?
This commit is contained in:
parent
f419f572db
commit
ba0a3b9abb
@ -31,14 +31,14 @@ except ImportError:
|
||||
def strjoin(iterable, joiner=''):
|
||||
return tostr(joiner).join(iterable)
|
||||
|
||||
def tobytes(s, encoding='ascii'):
|
||||
def tobytes(s, encoding='ascii', errors='strict'):
|
||||
if not isinstance(s, bytes):
|
||||
return s.encode(encoding)
|
||||
return s.encode(encoding, errors)
|
||||
else:
|
||||
return s
|
||||
def tounicode(s, encoding='ascii'):
|
||||
def tounicode(s, encoding='ascii', errors='strict'):
|
||||
if not isinstance(s, unicode):
|
||||
return s.decode(encoding)
|
||||
return s.decode(encoding, errors)
|
||||
else:
|
||||
return s
|
||||
|
||||
|
@ -150,37 +150,61 @@ class NameRecord(object):
|
||||
},
|
||||
}
|
||||
|
||||
def getEncoding(self):
|
||||
encoding = self._encodingMap.get(self.platformID, {}).get(self.platEncID, None)
|
||||
def getEncoding(self, default='ascii'):
|
||||
"""Returns the Python encoding name for this name entry based on its platformID,
|
||||
platEncID, and langID. If encoding for these values is not known, by default
|
||||
'ascii' is returned. That can be overridden by passing a value to the default
|
||||
argument.
|
||||
"""
|
||||
encoding = self._encodingMap.get(self.platformID, {}).get(self.platEncID, default)
|
||||
if isinstance(encoding, dict):
|
||||
encoding = encoding.get(self.langID, encoding[Ellipsis])
|
||||
return encoding
|
||||
|
||||
def encodingIsUnicodeCompatible(self):
|
||||
return self.getEncoding() in ['utf-16be', 'ucs2be', 'ascii', 'latin1']
|
||||
return self.getEncoding(None) in ['utf-16be', 'ucs2be', 'ascii', 'latin1']
|
||||
|
||||
def __str__(self):
|
||||
unistr = self.toUnicode()
|
||||
if unistr != None:
|
||||
return unistr
|
||||
else:
|
||||
try:
|
||||
return self.toUnicode()
|
||||
except UnicodeDecodeError:
|
||||
return str(self.string)
|
||||
|
||||
def isUnicode(self):
|
||||
return (self.platformID == 0 or
|
||||
(self.platformID == 3 and self.platEncID in [0, 1, 10]))
|
||||
|
||||
def toUnicode(self):
|
||||
encoding = self.getEncoding()
|
||||
if encoding == None:
|
||||
return None
|
||||
return tounicode(self.string, encoding=encoding)
|
||||
def toUnicode(self, errors='strict'):
|
||||
"""
|
||||
If self.string is a Unicode string, return it; otherwise try decoding the
|
||||
bytes in self.string to a Unicode string using the encoding of this
|
||||
entry as returned by self.getEncoding(); Note that self.getEncoding()
|
||||
returns 'ascii' if the encoding is unknown to the library.
|
||||
|
||||
def toBytes(self):
|
||||
return tobytes(self.string, encoding=self.getEncoding())
|
||||
If the bytes are ill-formed in that chosen encoding, the error is handled
|
||||
according to the errors parameter to this function, which is passed to the
|
||||
underlying decode() function; by default it throws a UnicodeDecodeError exception.
|
||||
"""
|
||||
return tounicode(self.string, encoding=self.getEncoding(), errors=errors)
|
||||
|
||||
def toBytes(self, errors='strict'):
|
||||
""" If self.string is a bytes object, return it; otherwise try encoding
|
||||
the Unicode string in self.string to bytes using the encoding of this
|
||||
entry as returned by self.getEncoding(); Note that self.getEncoding()
|
||||
returns 'ascii' if the encoding is unknown to the library.
|
||||
|
||||
If the Unicode string cannot be encoded to bytes in the chosen encoding,
|
||||
the error is handled according to the errors parameter to this function,
|
||||
which is passed to the underlying encode() function; by default it throws a
|
||||
UnicodeEncodeError exception.
|
||||
"""
|
||||
return tobytes(self.string, encoding=self.getEncoding(), errors=errors)
|
||||
|
||||
def toXML(self, writer, ttFont):
|
||||
unistr = self.toUnicode()
|
||||
try:
|
||||
unistr = self.toUnicode()
|
||||
except UnicodeDecodeError:
|
||||
unistr = None
|
||||
attrs = [
|
||||
("nameID", self.nameID),
|
||||
("platformID", self.platformID),
|
||||
|
@ -9,7 +9,7 @@ class NameRecordTest(unittest.TestCase):
|
||||
def makeName(self, text, nameID, platformID, platEncID, langID):
|
||||
name = NameRecord()
|
||||
name.nameID, name.platformID, name.platEncID, name.langID = (nameID, platformID, platEncID, langID)
|
||||
name.string = text.encode(name.getEncoding())
|
||||
name.string = tobytes(text, encoding=name.getEncoding())
|
||||
return name
|
||||
|
||||
def test_toUnicode_utf16be(self):
|
||||
@ -19,7 +19,7 @@ class NameRecordTest(unittest.TestCase):
|
||||
|
||||
def test_toUnicode_macroman(self):
|
||||
name = self.makeName("Foo Italic", 222, 1, 0, 7) # MacRoman
|
||||
self.assertEqual("macroman", name.getEncoding())
|
||||
self.assertEqual("mac-roman", name.getEncoding())
|
||||
self.assertEqual("Foo Italic", name.toUnicode())
|
||||
|
||||
def test_toUnicode_UnicodeDecodeError(self):
|
||||
@ -50,19 +50,24 @@ class NameRecordTest(unittest.TestCase):
|
||||
'</namerecord>'
|
||||
], self.toXML(name))
|
||||
|
||||
def test_toXML_unknownPlatEncID(self):
|
||||
name = NameRecord()
|
||||
name.string = b"B\x8arli"
|
||||
name.nameID, name.platformID, name.platEncID, name.langID = (333, 1, 9876, 7)
|
||||
def test_toXML_unknownPlatEncID_nonASCII(self):
|
||||
name = self.makeName(b"B\x8arli", 333, 1, 9876, 7) # Unknown Mac encodingID
|
||||
self.assertEqual([
|
||||
'<namerecord nameID="333" platformID="1" platEncID="9876" langID="0x7" unicode="False">',
|
||||
' BŠrli',
|
||||
'</namerecord>'
|
||||
], self.toXML(name))
|
||||
|
||||
def test_toXML_unknownPlatEncID_ASCII(self):
|
||||
name = self.makeName(b"Barli", 333, 1, 9876, 7) # Unknown Mac encodingID
|
||||
self.assertEqual([
|
||||
'<namerecord nameID="333" platformID="1" platEncID="9876" langID="0x7" unicode="True">',
|
||||
' Barli',
|
||||
'</namerecord>'
|
||||
], self.toXML(name))
|
||||
|
||||
def test_encoding_macroman_misc(self):
|
||||
name = NameRecord()
|
||||
name.nameID, name.platformID, name.platEncID, name.langID = (123, 1, 0, 17)
|
||||
name = self.makeName('', 123, 1, 0, 17) # Mac Turkish
|
||||
self.assertEqual(name.getEncoding(), "mac-turkish")
|
||||
name.langID = 37
|
||||
self.assertEqual(name.getEncoding(), None)
|
||||
@ -70,9 +75,7 @@ class NameRecordTest(unittest.TestCase):
|
||||
self.assertEqual(name.getEncoding(), "mac-roman")
|
||||
|
||||
def test_extended_mac_encodings(self):
|
||||
name = NameRecord()
|
||||
name.string = b"\xfe"
|
||||
name.nameID, name.platformID, name.platEncID, name.langID = (123, 1, 1, 0)
|
||||
name = self.makeName(b'\xfe', 123, 1, 1, 0) # Mac Japanese
|
||||
self.assertEqual(name.toUnicode(), unichr(0x2122))
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
Loading…
x
Reference in New Issue
Block a user