Merge pull request #2137 from fonttools/x-mac-encodings-errors

codecs: handle errors different from 'strict' for extended mac encodings
2021-01-20 11:21:43 +00:00 · 2021-01-20 11:21:43 +00:00 · 81fa5b5265
commit 81fa5b5265
parent c9c30fa51d d8c42ef7f0
2 changed files with 26 additions and 28 deletions
--- a/Lib/fontTools/encodings/codecs.py
+++ b/Lib/fontTools/encodings/codecs.py
@@ -16,43 +16,29 @@ class ExtendCodec(codecs.Codec):
 		self.info = codecs.CodecInfo(name=self.name, encode=self.encode, decode=self.decode)
 		codecs.register_error(name, self.error)

-	def encode(self, input, errors='strict'):
-		assert errors == 'strict'
-		#return codecs.encode(input, self.base_encoding, self.name), len(input)
-
-		# The above line could totally be all we needed, relying on the error
-		# handling to replace the unencodable Unicode characters with our extended
-		# byte sequences.
-		#
-		# However, there seems to be a design bug in Python (probably intentional):
-		# the error handler for encoding is supposed to return a **Unicode** character,
-		# that then needs to be encodable itself...  Ugh.
-		#
-		# So we implement what codecs.encode() should have been doing: which is expect
-		# error handler to return bytes() to be added to the output.
-		#
-		# This seems to have been fixed in Python 3.3.  We should try using that and
-		# use fallback only if that failed.
-		# https://docs.python.org/3.3/library/codecs.html#codecs.register_error
-
+	def _map(self, mapper, output_type, exc_type, input, errors):
+		base_error_handler = codecs.lookup_error(errors)
 		length = len(input)
-		out = b''
+		out = output_type()
 		while input:
+			# first try to use self.error as the error handler
 			try:
-				part = codecs.encode(input, self.base_encoding)
+				part = mapper(input, self.base_encoding, errors=self.name)
 				out += part
-				input = '' # All converted
-			except UnicodeEncodeError as e:
-				# Convert the correct part
-				out += codecs.encode(input[:e.start], self.base_encoding)
-				replacement, pos = self.error(e)
+				break  # All converted
+			except exc_type as e:
+				# else convert the correct part, handle error as requested and continue
+				out += mapper(input[:e.start], self.base_encoding, self.name)
+				replacement, pos = base_error_handler(e)
 				out += replacement
 				input = input[pos:]
 		return out, length

+	def encode(self, input, errors='strict'):
+		return self._map(codecs.encode, bytes, UnicodeEncodeError, input, errors)
+
 	def decode(self, input, errors='strict'):
-		assert errors == 'strict'
-		return codecs.decode(input, self.base_encoding, self.name), len(input)
+		return self._map(codecs.decode, str, UnicodeDecodeError, input, errors)

 	def error(self, e):
 		if isinstance(e, UnicodeDecodeError):
--- a/Tests/ttLib/tables/_n_a_m_e_test.py
+++ b/Tests/ttLib/tables/_n_a_m_e_test.py
@@ -432,6 +432,18 @@ class NameRecordTest(unittest.TestCase):
 		name = makeName(b'\xfe', 123, 1, 1, 0) # Mac Japanese
 		self.assertEqual(name.toUnicode(), unichr(0x2122))

+	def test_extended_mac_encodings_errors(self):
+		s = "汉仪彩云体简"
+		name = makeName(s.encode("x_mac_simp_chinese_ttx"), 123, 1, 25, 0)
+		# first check we round-trip with 'strict'
+		self.assertEqual(name.toUnicode(errors="strict"), s)
+
+		# append an incomplete invalid sequence and check that we handle
+		# errors with the requested error handler
+		name.string += b"\xba"
+		self.assertEqual(name.toUnicode(errors="backslashreplace"), s + "\\xba")
+		self.assertEqual(name.toUnicode(errors="replace"), s + "�")
+
 	def test_extended_unknown(self):
 		name = makeName(b'\xfe', 123, 10, 11, 12)
 		self.assertEqual(name.getEncoding(), "ascii")