codecs: support `errors` handlers other than 'strict' for extended Mac encodings
Fixes #2132
This commit is contained in:
parent
70958dca86
commit
d8c42ef7f0
@ -16,43 +16,29 @@ class ExtendCodec(codecs.Codec):
|
|||||||
self.info = codecs.CodecInfo(name=self.name, encode=self.encode, decode=self.decode)
|
self.info = codecs.CodecInfo(name=self.name, encode=self.encode, decode=self.decode)
|
||||||
codecs.register_error(name, self.error)
|
codecs.register_error(name, self.error)
|
||||||
|
|
||||||
def encode(self, input, errors='strict'):
|
def _map(self, mapper, output_type, exc_type, input, errors):
|
||||||
assert errors == 'strict'
|
base_error_handler = codecs.lookup_error(errors)
|
||||||
#return codecs.encode(input, self.base_encoding, self.name), len(input)
|
|
||||||
|
|
||||||
# The above line could totally be all we needed, relying on the error
|
|
||||||
# handling to replace the unencodable Unicode characters with our extended
|
|
||||||
# byte sequences.
|
|
||||||
#
|
|
||||||
# However, there seems to be a design bug in Python (probably intentional):
|
|
||||||
# the error handler for encoding is supposed to return a **Unicode** character,
|
|
||||||
# that then needs to be encodable itself... Ugh.
|
|
||||||
#
|
|
||||||
# So we implement what codecs.encode() should have been doing: which is expect
|
|
||||||
# error handler to return bytes() to be added to the output.
|
|
||||||
#
|
|
||||||
# This seems to have been fixed in Python 3.3. We should try using that and
|
|
||||||
# use fallback only if that failed.
|
|
||||||
# https://docs.python.org/3.3/library/codecs.html#codecs.register_error
|
|
||||||
|
|
||||||
length = len(input)
|
length = len(input)
|
||||||
out = b''
|
out = output_type()
|
||||||
while input:
|
while input:
|
||||||
|
# first try to use self.error as the error handler
|
||||||
try:
|
try:
|
||||||
part = codecs.encode(input, self.base_encoding)
|
part = mapper(input, self.base_encoding, errors=self.name)
|
||||||
out += part
|
out += part
|
||||||
input = '' # All converted
|
break # All converted
|
||||||
except UnicodeEncodeError as e:
|
except exc_type as e:
|
||||||
# Convert the correct part
|
# else convert the correct part, handle error as requested and continue
|
||||||
out += codecs.encode(input[:e.start], self.base_encoding)
|
out += mapper(input[:e.start], self.base_encoding, self.name)
|
||||||
replacement, pos = self.error(e)
|
replacement, pos = base_error_handler(e)
|
||||||
out += replacement
|
out += replacement
|
||||||
input = input[pos:]
|
input = input[pos:]
|
||||||
return out, length
|
return out, length
|
||||||
|
|
||||||
|
def encode(self, input, errors='strict'):
|
||||||
|
return self._map(codecs.encode, bytes, UnicodeEncodeError, input, errors)
|
||||||
|
|
||||||
def decode(self, input, errors='strict'):
|
def decode(self, input, errors='strict'):
|
||||||
assert errors == 'strict'
|
return self._map(codecs.decode, str, UnicodeDecodeError, input, errors)
|
||||||
return codecs.decode(input, self.base_encoding, self.name), len(input)
|
|
||||||
|
|
||||||
def error(self, e):
|
def error(self, e):
|
||||||
if isinstance(e, UnicodeDecodeError):
|
if isinstance(e, UnicodeDecodeError):
|
||||||
|
@ -432,6 +432,18 @@ class NameRecordTest(unittest.TestCase):
|
|||||||
name = makeName(b'\xfe', 123, 1, 1, 0) # Mac Japanese
|
name = makeName(b'\xfe', 123, 1, 1, 0) # Mac Japanese
|
||||||
self.assertEqual(name.toUnicode(), unichr(0x2122))
|
self.assertEqual(name.toUnicode(), unichr(0x2122))
|
||||||
|
|
||||||
|
def test_extended_mac_encodings_errors(self):
|
||||||
|
s = "汉仪彩云体简"
|
||||||
|
name = makeName(s.encode("x_mac_simp_chinese_ttx"), 123, 1, 25, 0)
|
||||||
|
# first check we round-trip with 'strict'
|
||||||
|
self.assertEqual(name.toUnicode(errors="strict"), s)
|
||||||
|
|
||||||
|
# append an incomplete invalid sequence and check that we handle
|
||||||
|
# errors with the requested error handler
|
||||||
|
name.string += b"\xba"
|
||||||
|
self.assertEqual(name.toUnicode(errors="backslashreplace"), s + "\\xba")
|
||||||
|
self.assertEqual(name.toUnicode(errors="replace"), s + "\ufffd")
|
||||||
|
|
||||||
def test_extended_unknown(self):
|
def test_extended_unknown(self):
|
||||||
name = makeName(b'\xfe', 123, 10, 11, 12)
|
name = makeName(b'\xfe', 123, 10, 11, 12)
|
||||||
self.assertEqual(name.getEncoding(), "ascii")
|
self.assertEqual(name.getEncoding(), "ascii")
|
||||||
|
Loading…
x
Reference in New Issue
Block a user