codecs: handle errors different from 'strict' for extended mac encodings

Fixes #2132
2020-12-24 20:37:52 +01:00 · 2020-12-24 20:37:52 +01:00 · d8c42ef7f0
commit d8c42ef7f0
parent 70958dca86
2 changed files with 26 additions and 28 deletions
--- a/Lib/fontTools/encodings/codecs.py
+++ b/Lib/fontTools/encodings/codecs.py
@@ -16,43 +16,29 @@ class ExtendCodec(codecs.Codec):
 		self.info = codecs.CodecInfo(name=self.name, encode=self.encode, decode=self.decode)
 		codecs.register_error(name, self.error)
-	def encode(self, input, errors='strict'):
+	def _map(self, mapper, output_type, exc_type, input, errors):
-		assert errors == 'strict'
+		base_error_handler = codecs.lookup_error(errors)
 		#return codecs.encode(input, self.base_encoding, self.name), len(input)
 		# The above line could totally be all we needed, relying on the error
 		# handling to replace the unencodable Unicode characters with our extended
 		# byte sequences.
 		#
 		# However, there seems to be a design bug in Python (probably intentional):
 		# the error handler for encoding is supposed to return a **Unicode** character,
 		# that then needs to be encodable itself...  Ugh.
 		#
 		# So we implement what codecs.encode() should have been doing: which is expect
 		# error handler to return bytes() to be added to the output.
 		#
 		# This seems to have been fixed in Python 3.3.  We should try using that and
 		# use fallback only if that failed.
 		# https://docs.python.org/3.3/library/codecs.html#codecs.register_error
 		length = len(input)
-		out = b''
+		out = output_type()
 		while input:
 			# first try to use self.error as the error handler
 			try:
-				part = codecs.encode(input, self.base_encoding)
+				part = mapper(input, self.base_encoding, errors=self.name)
 				out += part
-				input = '' # All converted
+				break  # All converted
-			except UnicodeEncodeError as e:
+			except exc_type as e:
-				# Convert the correct part
+				# else convert the correct part, handle error as requested and continue
-				out += codecs.encode(input[:e.start], self.base_encoding)
+				out += mapper(input[:e.start], self.base_encoding, self.name)
-				replacement, pos = self.error(e)
+				replacement, pos = base_error_handler(e)
 				out += replacement
 				input = input[pos:]
 		return out, length
 	def encode(self, input, errors='strict'):
 		return self._map(codecs.encode, bytes, UnicodeEncodeError, input, errors)
 	def decode(self, input, errors='strict'):
-		assert errors == 'strict'
+		return self._map(codecs.decode, str, UnicodeDecodeError, input, errors)
 		return codecs.decode(input, self.base_encoding, self.name), len(input)
 	def error(self, e):
 		if isinstance(e, UnicodeDecodeError):
--- a/Tests/ttLib/tables/_n_a_m_e_test.py
+++ b/Tests/ttLib/tables/_n_a_m_e_test.py
@@ -432,6 +432,18 @@ class NameRecordTest(unittest.TestCase):
 		name = makeName(b'\xfe', 123, 1, 1, 0) # Mac Japanese
 		self.assertEqual(name.toUnicode(), unichr(0x2122))
 	def test_extended_mac_encodings_errors(self):
 		s = "汉仪彩云体简"
 		name = makeName(s.encode("x_mac_simp_chinese_ttx"), 123, 1, 25, 0)
 		# first check we round-trip with 'strict'
 		self.assertEqual(name.toUnicode(errors="strict"), s)
 		# append an incomplete invalid sequence and check that we handle
 		# errors with the requested error handler
 		name.string += b"\xba"
 		self.assertEqual(name.toUnicode(errors="backslashreplace"), s + "\\xba")
 		self.assertEqual(name.toUnicode(errors="replace"), s + "�")
 	def test_extended_unknown(self):
 		name = makeName(b'\xfe', 123, 10, 11, 12)
 		self.assertEqual(name.getEncoding(), "ascii")