2017-11-17 19:17:17 +00:00
|
|
|
from __future__ import (
|
|
|
|
print_function, division, absolute_import, unicode_literals)
|
|
|
|
from fontTools.misc.py23 import *
|
|
|
|
|
2017-11-20 13:30:17 +01:00
|
|
|
from bisect import bisect_right
|
2017-11-17 19:17:17 +00:00
|
|
|
|
|
|
|
try:
|
|
|
|
# use unicodedata backport compatible with python2:
|
|
|
|
# https://github.com/mikekap/unicodedata2
|
|
|
|
from unicodedata2 import *
|
2017-11-20 18:16:02 +01:00
|
|
|
except ImportError: # pragma: no cover
|
2017-11-17 19:17:17 +00:00
|
|
|
# fall back to built-in unicodedata (possibly outdated)
|
|
|
|
from unicodedata import *
|
|
|
|
|
2017-11-20 18:16:02 +01:00
|
|
|
from . import Blocks, Scripts, ScriptExtensions
|
2017-11-17 19:17:17 +00:00
|
|
|
|
|
|
|
|
|
|
|
# Names exported by a star-import of this package. The first group mirrors
# the functions and constants pulled in from unicodedata2 / unicodedata above;
# the second group lists the helpers defined in this module.
__all__ = [
    # names from built-in unicodedata module
    "lookup",
    "name",
    "decimal",
    "digit",
    "numeric",
    "category",
    "bidirectional",
    "combining",
    "east_asian_width",
    "mirrored",
    "decomposition",
    "normalize",
    "unidata_version",
    "ucd_3_2_0",
    # additional functions
    "block",
    "script",
    "script_extension",
    # 'script_name' is defined in this module alongside the helpers above, so
    # it is exported with them.
    "script_name",
]
|
|
|
|
|
|
|
|
|
|
|
|
def script(char):
    """ Look up the script of the Unicode character 'char' and return its
    four-letter script code as a string.

    >>> script("a")
    'Latn'
    >>> script(",")
    'Zyyy'
    >>> script(unichr(0x10FFFF))
    'Zzzz'
    """
    codepoint = byteord(char)
    # 'Scripts.RANGES' is a sorted list holding only the starting breakpoint
    # of each range. 'bisect_right(a, x)' returns the insertion point located
    # after any entry equal to x, so every entry to its left is <= x and
    # every entry to its right is > x. The range containing the codepoint is
    # the last one whose start is <= codepoint, i.e. the entry immediately
    # before the insertion point; hence the '- 1'.
    position = bisect_right(Scripts.RANGES, codepoint) - 1
    return Scripts.VALUES[position]
|
|
|
|
|
|
|
|
|
|
|
|
def script_extension(char):
    """ Look up the script extension property of the Unicode character
    'char' and return it as a set of strings.

    >>> script_extension("a") == {'Latn'}
    True
    >>> script_extension(unichr(0x060C)) == {'Arab', 'Syrc', 'Thaa'}
    True
    >>> script_extension(unichr(0x10FFFF)) == {'Zzzz'}
    True
    """
    codepoint = byteord(char)
    # Same range lookup as in 'script': take the entry just before the
    # insertion point returned by 'bisect_right'.
    position = bisect_right(ScriptExtensions.RANGES, codepoint) - 1
    extensions = ScriptExtensions.VALUES[position]
    if extensions is not None:
        return extensions
    # A None entry marks code points that are not explicitly listed for
    # Script Extensions; their value is the corresponding Script property
    # value, wrapped in a one-element set.
    return {script(char)}
|
|
|
|
|
|
|
|
|
2017-11-22 16:57:14 +01:00
|
|
|
def script_name(code):
    """ Translate a four-letter Unicode script code into its long,
    human-readable script name.

    Underscores in the stored name are replaced with spaces.

    Raises KeyError if no matching name is found.
    """
    long_name = Scripts.NAMES[code]
    return long_name.replace("_", " ")
|
|
|
|
|
|
|
|
|
2017-11-20 18:16:02 +01:00
|
|
|
def block(char):
    """ Look up the block property of the Unicode character 'char' and
    return it as a string.

    >>> block("a")
    'Basic Latin'
    >>> block(unichr(0x060C))
    'Arabic'
    >>> block(unichr(0xEFFFF))
    'No_Block'
    """
    codepoint = byteord(char)
    # The entry just before the insertion point returned by 'bisect_right'
    # is the last one in 'Blocks.RANGES' that is <= codepoint.
    position = bisect_right(Blocks.RANGES, codepoint) - 1
    return Blocks.VALUES[position]
|