Merge pull request #2137 from fonttools/x-mac-encodings-errors

codecs: handle error modes other than 'strict' for the extended Mac encodings
This commit is contained in:
Cosimo Lupo 2021-01-20 11:21:43 +00:00 committed by GitHub
commit 81fa5b5265
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 26 additions and 28 deletions

View File

@ -16,43 +16,29 @@ class ExtendCodec(codecs.Codec):
self.info = codecs.CodecInfo(name=self.name, encode=self.encode, decode=self.decode)
codecs.register_error(name, self.error)
def _map(self, mapper, output_type, exc_type, input, errors):
    """Convert *input* through the base codec, honouring the *errors* mode.

    Shared implementation behind ``encode`` and ``decode``.

    Args:
        mapper: ``codecs.encode`` or ``codecs.decode``.
        output_type: ``bytes`` when encoding, ``str`` when decoding; also
            used to create the empty accumulator.
        exc_type: the exception *mapper* raises on a bad sequence
            (``UnicodeEncodeError`` or ``UnicodeDecodeError``).
        input: the text or bytes to convert.
        errors: name of the error handler requested by the caller
            ('strict', 'replace', 'ignore', 'backslashreplace', ...).

    Returns:
        A ``(converted, consumed_length)`` tuple, as the codec API expects.

    Raises:
        LookupError: if *errors* does not name a registered error handler.
        exc_type: if the requested handler (e.g. 'strict') re-raises.
    """
    base_error_handler = codecs.lookup_error(errors)
    length = len(input)
    out = output_type()
    while input:
        # First try the whole remaining input with the error handler that was
        # registered under self.name, so the extended sequences get mapped.
        try:
            out += mapper(input, self.base_encoding, errors=self.name)
            break  # All converted
        except exc_type as e:
            # The conversion still failed at e.start: keep the valid prefix,
            # let the handler the caller asked for deal with the offending
            # span, then resume right after it.
            out += mapper(input[:e.start], self.base_encoding, self.name)
            replacement, pos = base_error_handler(e)
            if not isinstance(replacement, output_type):
                # Encoding handlers such as 'replace' or 'ignore' hand back a
                # str, which cannot be appended to a bytes accumulator; run it
                # through the codec first.
                replacement = mapper(replacement, self.base_encoding, self.name)
            out += replacement
            input = input[pos:]
    return out, length

def encode(self, input, errors='strict'):
    """Encode the str *input*; return ``(bytes, number_of_characters_consumed)``."""
    return self._map(codecs.encode, bytes, UnicodeEncodeError, input, errors)

def decode(self, input, errors='strict'):
    """Decode the bytes *input*; return ``(str, number_of_bytes_consumed)``."""
    return self._map(codecs.decode, str, UnicodeDecodeError, input, errors)
def error(self, e):
if isinstance(e, UnicodeDecodeError):

View File

@ -432,6 +432,18 @@ class NameRecordTest(unittest.TestCase):
name = makeName(b'\xfe', 123, 1, 1, 0) # Mac Japanese
self.assertEqual(name.toUnicode(), unichr(0x2122))
def test_extended_mac_encodings_errors(self):
    """Check that toUnicode() honours non-strict error handlers."""
    s = "汉仪彩云体简"
    name = makeName(s.encode("x_mac_simp_chinese_ttx"), 123, 1, 25, 0)
    # first check we round-trip with 'strict'
    self.assertEqual(name.toUnicode(errors="strict"), s)
    # append an incomplete invalid sequence and check that we handle
    # errors with the requested error handler
    name.string += b"\xba"
    self.assertEqual(name.toUnicode(errors="backslashreplace"), s + "\\xba")
    # 'replace' substitutes U+FFFD REPLACEMENT CHARACTER for the bad byte;
    # spelled as an escape so the literal survives any re-encoding of the file
    self.assertEqual(name.toUnicode(errors="replace"), s + "\ufffd")
def test_extended_unknown(self):
name = makeName(b'\xfe', 123, 10, 11, 12)
self.assertEqual(name.getEncoding(), "ascii")