From d8c42ef7f013b382248b91869aabf232d90c14e0 Mon Sep 17 00:00:00 2001
From: Cosimo Lupo
Date: Thu, 24 Dec 2020 20:37:52 +0100
Subject: [PATCH] codecs: handle errors different from 'strict' for extended mac encodings

Fixes #2132
---
Review notes (free-text area after the "---" separator; not part of the
commit message):

* What the patch does: ExtendCodec.encode() and ExtendCodec.decode() no
  longer assert errors == 'strict'.  Both now delegate to a shared helper,
  _map(), which first runs the base codec with the codec's own registered
  error handler (errors=self.name) and, if that still raises, converts the
  valid prefix, applies the caller-requested handler obtained from
  codecs.lookup_error(errors), and continues with the remaining input.

* NOTE(review): in the encode direction the accumulator is bytes()
  (output_type), while Python's built-in non-strict handlers ('replace',
  'ignore', 'backslashreplace', ...) return a str replacement for a
  UnicodeEncodeError.  `out += replacement` would then raise TypeError.
  Only the decode direction is exercised by the new test -- confirm and
  follow up separately.

* NOTE(review): the leading indentation inside the hunks below was lost in
  this copy of the patch and has been re-created with 4-space indents.
  Verify the whitespace against the repository before applying.

 Lib/fontTools/encodings/codecs.py   | 42 ++++++++++--------------------
 Tests/ttLib/tables/_n_a_m_e_test.py | 12 +++++++++
 2 files changed, 26 insertions(+), 28 deletions(-)

diff --git a/Lib/fontTools/encodings/codecs.py b/Lib/fontTools/encodings/codecs.py
index ac2b99094..c2288a777 100644
--- a/Lib/fontTools/encodings/codecs.py
+++ b/Lib/fontTools/encodings/codecs.py
@@ -16,43 +16,29 @@ class ExtendCodec(codecs.Codec):
         self.info = codecs.CodecInfo(name=self.name, encode=self.encode, decode=self.decode)
         codecs.register_error(name, self.error)
 
-    def encode(self, input, errors='strict'):
-        assert errors == 'strict'
-        #return codecs.encode(input, self.base_encoding, self.name), len(input)
-
-        # The above line could totally be all we needed, relying on the error
-        # handling to replace the unencodable Unicode characters with our extended
-        # byte sequences.
-        #
-        # However, there seems to be a design bug in Python (probably intentional):
-        # the error handler for encoding is supposed to return a **Unicode** character,
-        # that then needs to be encodable itself... Ugh.
-        #
-        # So we implement what codecs.encode() should have been doing: which is expect
-        # error handler to return bytes() to be added to the output.
-        #
-        # This seems to have been fixed in Python 3.3. We should try using that and
-        # use fallback only if that failed.
-        # https://docs.python.org/3.3/library/codecs.html#codecs.register_error
-
+    def _map(self, mapper, output_type, exc_type, input, errors):
+        base_error_handler = codecs.lookup_error(errors)
         length = len(input)
-        out = b''
+        out = output_type()
         while input:
+            # first try to use self.error as the error handler
             try:
-                part = codecs.encode(input, self.base_encoding)
+                part = mapper(input, self.base_encoding, errors=self.name)
                 out += part
-                input = '' # All converted
-            except UnicodeEncodeError as e:
-                # Convert the correct part
-                out += codecs.encode(input[:e.start], self.base_encoding)
-                replacement, pos = self.error(e)
+                break  # All converted
+            except exc_type as e:
+                # else convert the correct part, handle error as requested and continue
+                out += mapper(input[:e.start], self.base_encoding, self.name)
+                replacement, pos = base_error_handler(e)
                 out += replacement
                 input = input[pos:]
         return out, length
 
+    def encode(self, input, errors='strict'):
+        return self._map(codecs.encode, bytes, UnicodeEncodeError, input, errors)
+
     def decode(self, input, errors='strict'):
-        assert errors == 'strict'
-        return codecs.decode(input, self.base_encoding, self.name), len(input)
+        return self._map(codecs.decode, str, UnicodeDecodeError, input, errors)
 
     def error(self, e):
         if isinstance(e, UnicodeDecodeError):
diff --git a/Tests/ttLib/tables/_n_a_m_e_test.py b/Tests/ttLib/tables/_n_a_m_e_test.py
index bc4aab2f1..11aeebae9 100644
--- a/Tests/ttLib/tables/_n_a_m_e_test.py
+++ b/Tests/ttLib/tables/_n_a_m_e_test.py
@@ -432,6 +432,18 @@ class NameRecordTest(unittest.TestCase):
         name = makeName(b'\xfe', 123, 1, 1, 0) # Mac Japanese
         self.assertEqual(name.toUnicode(), unichr(0x2122))
 
+    def test_extended_mac_encodings_errors(self):
+        s = "汉仪彩云体简"
+        name = makeName(s.encode("x_mac_simp_chinese_ttx"), 123, 1, 25, 0)
+        # first check we round-trip with 'strict'
+        self.assertEqual(name.toUnicode(errors="strict"), s)
+
+        # append an incomplete invalid sequence and check that we handle
+        # errors with the requested error handler
+        name.string += b"\xba"
+        self.assertEqual(name.toUnicode(errors="backslashreplace"), s + "\\xba")
+        self.assertEqual(name.toUnicode(errors="replace"), s + "�")
+
     def test_extended_unknown(self):
         name = makeName(b'\xfe', 123, 10, 11, 12)
         self.assertEqual(name.getEncoding(), "ascii")