fonttools/Lib/fontTools/misc/py23.py

"""Python 2/3 compat layer."""

from __future__ import print_function, division, absolute_import
import sys

# Bind the Python 2 text types to module-level names of their own, so that they
# can be imported explicitly from this module on any interpreter. On Python 3
# neither built-in exists, and both names become aliases of 'str'.
try:
	basestring, unicode = basestring, unicode
except NameError:
	basestring = unicode = str

try:
	# Python 3 has no 'unichr' built-in, so this lookup raises NameError there
	# and the fallback definitions at the bottom are used instead. On Python 2
	# the self-assignment makes the name a real attribute of this module.
	unichr = unichr

	if sys.maxunicode < 0x10FFFF:
		# workarounds for Python 2 "narrow" builds with UCS2-only support.

		_narrow_unichr = unichr

		def unichr(i):
			"""
			Return the unicode character whose Unicode code is the integer 'i'.
			The valid range is 0 to 0x10FFFF inclusive.

			Code points rejected by the built-in unichr() are produced by
			decoding the equivalent 8-digit unicode-escape sequence.
			Raise ValueError if 'i' is outside of the valid range.

			>>> _narrow_unichr(0xFFFF + 1)
			Traceback (most recent call last):
			  File "<stdin>", line 1, in ?
			ValueError: unichr() arg not in range(0x10000) (narrow Python build)
			>>> unichr(0xFFFF + 1) == u'\U00010000'
			True
			>>> unichr(long(0xFFFF + 1)) == u'\U00010000'
			True
			>>> unichr(1114111) == u'\U0010FFFF'
			True
			>>> unichr(0x10FFFF + 1)
			Traceback (most recent call last):
			  File "<stdin>", line 1, in ?
			ValueError: unichr() arg not in range(0x110000)
			"""
			try:
				return _narrow_unichr(i)
			except ValueError:
				try:
					# '%08x' formatting, unlike hex(), never emits the trailing
					# 'L' that Python 2 appends to long integers and that would
					# corrupt the escape sequence.
					escape_str = "\\U%08x" % i
					return escape_str.decode("unicode-escape")
				except UnicodeDecodeError:
					raise ValueError('unichr() arg not in range(0x110000)')

		import re
		# Matches exactly one 8-digit escape and nothing else; the end anchor
		# keeps longer strings from being mistaken for a single character.
		_unicode_escape_RE = re.compile(r'\\U[A-Fa-f0-9]{8}\Z')

		def byteord(c):
			"""
			Given a 8-bit or unicode character, return an integer representing the
			Unicode code point of the character. If a unicode argument is given, the
			character's code point must be in the range 0 to 0x10FFFF inclusive.

			Raise TypeError if 'c' is not a single character.

			>>> ord(u'\U00010000')
			Traceback (most recent call last):
			  File "<stdin>", line 1, in ?
			TypeError: ord() expected a character, but string of length 2 found
			>>> byteord(u'\U00010000') == 0xFFFF + 1
			True
			>>> byteord(u'\U0010FFFF') == 1114111
			True
			>>> byteord(u'\U00010000a')
			Traceback (most recent call last):
			  File "<stdin>", line 1, in ?
			TypeError: ord() expected a character, but string of length 3 found
			"""
			try:
				return ord(c)
			except TypeError as e:
				# The built-in ord() rejected 'c'; accept it only if its
				# unicode-escape form is exactly one 8-digit escape sequence.
				try:
					escape_str = c.encode('unicode-escape')
				except Exception:
					# not a string, or not encodable: report the original error
					raise TypeError(e)
				if not _unicode_escape_RE.match(escape_str):
					raise TypeError(e)
				# skip the leading backslash and 'U', keep the 8 hex digits
				return int(escape_str[2:], 16)

	else:
		byteord = ord
	bytechr = chr

except NameError:
	# Python 3: 'chr' already covers the whole Unicode range.
	unichr = chr
	def bytechr(n):
		"""Return a bytes object of length one holding the byte value 'n'."""
		return bytes([n])
	def byteord(c):
		"""Return 'c' unchanged if it is an integer, otherwise return ord(c).

		Indexing or iterating over bytes already yields integers on Python 3.
		"""
		return c if isinstance(c, int) else ord(c)


# The 'io' module provides the same I/O interface on both Python 2 and 3.
# 'UnicodeIO' is defined here as an alias of io.StringIO, so that there is one
# name that unambiguously means a stream of unicode text; what the plain
# 'StringIO' name refers to depends on the Python version (see below).
from io import BytesIO
from io import StringIO as UnicodeIO
try:
	# in Python 2, by 'StringIO' we still mean a stream of *byte* strings
	from StringIO import StringIO
except ImportError:
	# in Python 3, we mean instead a stream of *unicode* strings
	StringIO = UnicodeIO


def strjoin(iterable, joiner=''):
	"""Join the items of 'iterable' into one string, separated by 'joiner'.

	The separator is converted with tostr() before it is used to join.
	"""
	separator = tostr(joiner)
	return separator.join(iterable)

def tobytes(s, encoding='ascii', errors='strict'):
	"""Return 's' as a bytes object.

	A bytes instance is handed back unchanged; any other object is encoded by
	calling its encode() method with the given 'encoding' and 'errors'.
	"""
	if isinstance(s, bytes):
		return s
	return s.encode(encoding, errors)
def tounicode(s, encoding='ascii', errors='strict'):
	"""Return 's' as a unicode string.

	A unicode instance is handed back unchanged; any other object is decoded by
	calling its decode() method with the given 'encoding' and 'errors'.
	"""
	if isinstance(s, unicode):
		return s
	return s.decode(encoding, errors)

if str == bytes:
	# Python 2: the native 'str' type is a byte string.
	class Tag(str):
		"""Native string with a tobytes() method."""

		def tobytes(self):
			"""Return the tag as a byte string (Latin-1 encoded if needed)."""
			if isinstance(self, bytes):
				return self
			else:
				return self.encode('latin1')

	tostr = tobytes

	bytesjoin = strjoin
else:
	# Python 3: the native 'str' type is a unicode string.
	class Tag(str):
		"""Native string that can be created from, and compared with, bytes.

		Byte input is decoded as Latin-1 before it is stored or compared.
		"""

		@staticmethod
		def transcode(blob):
			"""Return 'blob' as 'str', decoding it as Latin-1 if it is not one."""
			if not isinstance(blob, str):
				blob = blob.decode('latin-1')
			return blob

		def __new__(self, content):
			return str.__new__(self, self.transcode(content))

		def __ne__(self, other):
			result = self.__eq__(other)
			# Pass NotImplemented through untouched: negating it would give a
			# wrong answer and is an error on recent Python versions.
			return result if result is NotImplemented else not result

		def __eq__(self, other):
			try:
				other = self.transcode(other)
			except AttributeError:
				# 'other' is neither text nor decodable (e.g. None or an int):
				# defer to Python's default comparison instead of raising.
				return NotImplemented
			return str.__eq__(self, other)

		# Defining __eq__ would otherwise leave the class unhashable.
		def __hash__(self):
			return str.__hash__(self)

		def tobytes(self):
			"""Return the tag encoded as Latin-1 bytes."""
			return self.encode('latin-1')

	tostr = tounicode

	def bytesjoin(iterable, joiner=b''):
		"""Join the items of 'iterable' into one bytes object.

		Both the separator and every item are converted with tobytes() first.
		"""
		return tobytes(joiner).join(tobytes(item) for item in iterable)


if __name__ == "__main__":
	# Run this module's doctests; the exit status is the number of failures.
	import doctest
	import sys
	failure_count = doctest.testmod().failed
	sys.exit(failure_count)
Add py23 compat layer 2013-11-27 14:36:57 -05:00			`"""Python 2/3 compat layer."""`

from __future__ import absolute_import Such that our Python 2 is closer to Python 3. Part of https://github.com/behdad/fonttools/issues/77 2014-01-14 15:07:50 +08:00			`from __future__ import print_function, division, absolute_import`
[py23] simulate "wide" unichr and ord on "narrow" UCS2-only Python 2 2015-05-08 19:28:42 +01:00			`import sys`
py23 from __future__ import print_function 2013-11-27 17:27:45 -05:00
Add py23 compat layer 2013-11-27 14:36:57 -05:00			`try:`
[py23] fix ImportError when trying to import `unichr`, `basestring` or `unicode` when already defined When one does `from fontTools.misc.py23 import *`, everything seems to work fine. However, linters will complain when one uses the asterisk to import all names from a module, since they can't detect when names are left undefined -- asterisks are greedy and will eat all names. If one avoids the asterisk and attempts to import explicitly, like in `from fontTools.misc.py23 import basestring`, the problem then is that, if `py23` does not re-define the name -- e.g. under python2 `basestring` or `unicode` are built-ins -- then the import statement raises `ImportError`. The same happens for the `unichr` function on a "wide" Python 2 build (in which `sys.maxunicode == 0x10FFFF`). Now, to work around this, we need to re-assign those built-ins to their very same names. This may look silly, but at least it works. 2015-11-23 12:02:12 +00:00			`basestring = basestring`
Add py23 compat layer 2013-11-27 14:36:57 -05:00			`except NameError:`
			`basestring = str`

			`try:`
[py23] fix ImportError when trying to import `unichr`, `basestring` or `unicode` when already defined When one does `from fontTools.misc.py23 import *`, everything seems to work fine. However, linters will complain when one uses the asterisk to import all names from a module, since they can't detect when names are left undefined -- asterisks are greedy and will eat all names. If one avoids the asterisk and attempts to import explicitly, like in `from fontTools.misc.py23 import basestring`, the problem then is that, if `py23` does not re-define the name -- e.g. under python2 `basestring` or `unicode` are built-ins -- then the import statement raises `ImportError`. The same happens for the `unichr` function on a "wide" Python 2 build (in which `sys.maxunicode == 0x10FFFF`). Now, to work around this, we need to re-assign those built-ins to their very same names. This may look silly, but at least it works. 2015-11-23 12:02:12 +00:00			`unicode = unicode`
Add py23 compat layer 2013-11-27 14:36:57 -05:00			`except NameError:`
			`unicode = str`

			`try:`
[py23] fix ImportError when trying to import `unichr`, `basestring` or `unicode` when already defined When one does `from fontTools.misc.py23 import *`, everything seems to work fine. However, linters will complain when one uses the asterisk to import all names from a module, since they can't detect when names are left undefined -- asterisks are greedy and will eat all names. If one avoids the asterisk and attempts to import explicitly, like in `from fontTools.misc.py23 import basestring`, the problem then is that, if `py23` does not re-define the name -- e.g. under python2 `basestring` or `unicode` are built-ins -- then the import statement raises `ImportError`. The same happens for the `unichr` function on a "wide" Python 2 build (in which `sys.maxunicode == 0x10FFFF`). Now, to work around this, we need to re-assign those built-ins to their very same names. This may look silly, but at least it works. 2015-11-23 12:02:12 +00:00			`unichr = unichr`
[py23] simulate "wide" unichr and ord on "narrow" UCS2-only Python 2 2015-05-08 19:28:42 +01:00
			`if sys.maxunicode < 0x10FFFF:`
			`# workarounds for Python 2 "narrow" builds with UCS2-only support.`

			`_narrow_unichr = unichr`

			`def unichr(i):`
			`"""`
			`Return the unicode character whose Unicode code is the integer 'i'.`
			`The valid range is 0 to 0x10FFFF inclusive.`

			`>>> _narrow_unichr(0xFFFF + 1)`
			`Traceback (most recent call last):`
			`File "<stdin>", line 1, in ?`
			`ValueError: unichr() arg not in range(0x10000) (narrow Python build)`
			`>>> unichr(0xFFFF + 1) == u'\U00010000'`
			`True`
			`>>> unichr(1114111) == u'\U0010FFFF'`
			`True`
			`>>> unichr(0x10FFFF + 1)`
			`Traceback (most recent call last):`
			`File "<stdin>", line 1, in ?`
			`ValueError: unichr() arg not in range(0x110000)`
			`"""`
			`try:`
			`return _narrow_unichr(i)`
			`except ValueError:`
			`try:`
			`padded_hex_str = hex(i)[2:].zfill(8)`
			`escape_str = "\\U" + padded_hex_str`
			`return escape_str.decode("unicode-escape")`
			`except UnicodeDecodeError:`
			`raise ValueError('unichr() arg not in range(0x110000)')`

			`import re`
			`_unicode_escape_RE = re.compile(r'\\U[A-Fa-f0-9]{8}')`

			`def byteord(c):`
			`"""`
			`Given a 8-bit or unicode character, return an integer representing the`
			`Unicode code point of the character. If a unicode argument is given, the`
			`character's code point must be in the range 0 to 0x10FFFF inclusive.`

			`>>> ord(u'\U00010000')`
			`Traceback (most recent call last):`
			`File "<stdin>", line 1, in ?`
			`TypeError: ord() expected a character, but string of length 2 found`
			`>>> byteord(u'\U00010000') == 0xFFFF + 1`
			`True`
			`>>> byteord(u'\U0010FFFF') == 1114111`
			`True`
			`"""`
			`try:`
			`return ord(c)`
			`except TypeError as e:`
			`try:`
			`escape_str = c.encode('unicode-escape')`
			`if not _unicode_escape_RE.match(escape_str):`
			`raise`
			`hex_str = escape_str[3:]`
			`return int(hex_str, 16)`
			`except:`
			`raise TypeError(e)`

			`else:`
			`byteord = ord`
Add py23 compat layer 2013-11-27 14:36:57 -05:00			`bytechr = chr`
[py23] simulate "wide" unichr and ord on "narrow" UCS2-only Python 2 2015-05-08 19:28:42 +01:00
			`except NameError:`
Add py23 compat layer 2013-11-27 14:36:57 -05:00			`unichr = chr`
			`def bytechr(n):`
			`return bytes([n])`
py23 introduce byteord() and use it 2013-11-27 18:13:48 -05:00			`def byteord(c):`
py23 Fix up byteord() implementation 2013-11-27 21:13:05 -05:00			`return c if isinstance(c, int) else ord(c)`
Add py23 compat layer 2013-11-27 14:36:57 -05:00
[py23] define BytesIO, StringIO and 'UnicodeIO' to disambiguate bytes vs unicode in-memory streams 2015-08-07 15:44:58 +01:00
			`# the 'io' module provides the same I/O interface on both 2 and 3.`
			`# here we define an alias of io.StringIO to disambiguate it eternally...`
			`from io import BytesIO`
			`from io import StringIO as UnicodeIO`
Add py23 compat layer 2013-11-27 14:36:57 -05:00			`try:`
[py23] define BytesIO, StringIO and 'UnicodeIO' to disambiguate bytes vs unicode in-memory streams 2015-08-07 15:44:58 +01:00			`# in python 2, by 'StringIO' we still mean a stream of byte strings`
Use pure-Python StringIO, not cStringIO misc.psLib tries to subclass StringIO. It doesn't work with the cStringIO version. Change doesn't seem to affect performance of CFF, which is the biggest StringIO user. https://github.com/behdad/fonttools/commit/7279302238a2d57d609cab45934615c4d959c88c#commitcomment-4767054 2013-12-04 04:11:06 -05:00			`from StringIO import StringIO`
Add py23 compat layer 2013-11-27 14:36:57 -05:00			`except ImportError:`
[py23] define BytesIO, StringIO and 'UnicodeIO' to disambiguate bytes vs unicode in-memory streams 2015-08-07 15:44:58 +01:00			`# in Python 3, we mean instead a stream of unicode strings`
			`StringIO = UnicodeIO`

py23 Introduce Tag to autoconvert tag types 2013-11-27 16:44:53 -05:00
Fix XMLWriter to take sinks that accept both bytes() and unicodes() Fix xmlWriter_test with python3 as well. 2015-04-14 19:07:34 -07:00			`def strjoin(iterable, joiner=''):`
			`return tostr(joiner).join(iterable)`
[py23] Minor refactoring 2014-07-21 13:19:53 -04:00
Revamp name table Unicode handling some more Part of https://github.com/behdad/fonttools/issues/236 Now we fallback to ASCII for unknown encodings. Not sure if this might be a bad idea. The main user-visible difference is that if there's an ASCII-only text in an unknown encoding, we still "decode" it and use unicode="True" instead of unicode="False". Or is assuming that any unsupported encoding is ASCII-compatible too intrusive? 2015-04-16 17:09:49 -07:00			`def tobytes(s, encoding='ascii', errors='strict'):`
[py23] Minor refactoring 2014-07-21 13:19:53 -04:00			`if not isinstance(s, bytes):`
Revamp name table Unicode handling some more Part of https://github.com/behdad/fonttools/issues/236 Now we fallback to ASCII for unknown encodings. Not sure if this might be a bad idea. The main user-visible difference is that if there's an ASCII-only text in an unknown encoding, we still "decode" it and use unicode="True" instead of unicode="False". Or is assuming that any unsupported encoding is ASCII-compatible too intrusive? 2015-04-16 17:09:49 -07:00			`return s.encode(encoding, errors)`
[py23] Minor refactoring 2014-07-21 13:19:53 -04:00			`else:`
			`return s`
Revamp name table Unicode handling some more Part of https://github.com/behdad/fonttools/issues/236 Now we fallback to ASCII for unknown encodings. Not sure if this might be a bad idea. The main user-visible difference is that if there's an ASCII-only text in an unknown encoding, we still "decode" it and use unicode="True" instead of unicode="False". Or is assuming that any unsupported encoding is ASCII-compatible too intrusive? 2015-04-16 17:09:49 -07:00			`def tounicode(s, encoding='ascii', errors='strict'):`
[py23] Minor refactoring 2014-07-21 13:19:53 -04:00			`if not isinstance(s, unicode):`
Revamp name table Unicode handling some more Part of https://github.com/behdad/fonttools/issues/236 Now we fallback to ASCII for unknown encodings. Not sure if this might be a bad idea. The main user-visible difference is that if there's an ASCII-only text in an unknown encoding, we still "decode" it and use unicode="True" instead of unicode="False". Or is assuming that any unsupported encoding is ASCII-compatible too intrusive? 2015-04-16 17:09:49 -07:00			`return s.decode(encoding, errors)`
[py23] Minor refactoring 2014-07-21 13:19:53 -04:00			`else:`
			`return s`

py23 Introduce Tag to autoconvert tag types 2013-11-27 16:44:53 -05:00			`if str == bytes:`
			`class Tag(str):`
			`def tobytes(self):`
			`if isinstance(self, bytes):`
			`return self`
			`else:`
s/latin-1/latin1/g 2013-11-28 06:46:59 -05:00			`return self.encode('latin1')`
py23 tostr()/tobytes() and using them ttf->xml seems to be mostly working now. 2013-11-27 19:51:59 -05:00
[py23] Minor refactoring 2014-07-21 13:19:53 -04:00			`tostr = tobytes`
py23 Add bytesjoin() 2013-11-27 21:09:03 -05:00
ps23 More bytes fixes. All ''join()'s fixed 2013-11-27 21:17:35 -05:00			`bytesjoin = strjoin`
py23 Introduce Tag to autoconvert tag types 2013-11-27 16:44:53 -05:00			`else:`
			`class Tag(str):`

			`@staticmethod`
			`def transcode(blob):`
			`if not isinstance(blob, str):`
			`blob = blob.decode('latin-1')`
			`return blob`

			`def __new__(self, content):`
			`return str.__new__(self, self.transcode(content))`
Implement __ne__ when __eq__ is defined 2013-12-06 22:25:48 -05:00			`def __ne__(self, other):`
			`return not self.__eq__(other)`
py23 Introduce Tag to autoconvert tag types 2013-11-27 16:44:53 -05:00			`def __eq__(self, other):`
			`return str.__eq__(self, self.transcode(other))`

			`def __hash__(self):`
			`return str.__hash__(self)`

			`def tobytes(self):`
			`return self.encode('latin-1')`
py23 tostr()/tobytes() and using them ttf->xml seems to be mostly working now. 2013-11-27 19:51:59 -05:00
[py23] Minor refactoring 2014-07-21 13:19:53 -04:00			`tostr = tounicode`
py23 Add bytesjoin() 2013-11-27 21:09:03 -05:00
Fix XMLWriter to take sinks that accept both bytes() and unicodes() Fix xmlWriter_test with python3 as well. 2015-04-14 19:07:34 -07:00			`def bytesjoin(iterable, joiner=b''):`
			`return tobytes(joiner).join(tobytes(item) for item in iterable)`
[py23] simulate "wide" unichr and ord on "narrow" UCS2-only Python 2 2015-05-08 19:28:42 +01:00

			`if __name__ == "__main__":`
			`import doctest, sys`
			`sys.exit(doctest.testmod().failed)`