[unicodedata] add block and script_extension functions

This commit is contained in:
Cosimo Lupo 2017-11-20 18:16:02 +01:00
parent 8b50ed56d9
commit 697b8d9af5

View File

@ -8,11 +8,11 @@ try:
# use unicodedata backport compatible with python2:
# https://github.com/mikekap/unicodedata2
from unicodedata2 import *
except ImportError:
except ImportError: # pragma: no cover
# fall back to built-in unicodedata (possibly outdated)
from unicodedata import *
from .scripts import SCRIPT_RANGES, SCRIPT_NAMES
from . import Blocks, Scripts, ScriptExtensions
__all__ = [
@ -32,11 +32,23 @@ __all__ = [
"unidata_version",
"ucd_3_2_0",
# additional functions
"block",
"script",
"script_extension",
]
def script(char):
    """ Return the script property assigned to the Unicode character 'char'
    as string.

    >>> script("a")
    'Latin'
    >>> script(",")
    'Common'
    >>> script(unichr(0x10FFFF))
    'Unknown'
    """
    code = byteord(char)
    # 'bisect_right(a, x)' returns an insertion point which comes after (to
    # the right of) any existing entries of x in a. 'Scripts.RANGES' is
    # searched as the start points of consecutive ranges (bisect expects it
    # to be sorted); we want the range that contains the given codepoint,
    # i.e. the one whose start is less than or equal to the codepoint.
    # Thus, we subtract 1 from the index returned.
    i = bisect_right(Scripts.RANGES, code)
    return Scripts.VALUES[i-1]
def script_extension(char):
    """ Return the script extension property assigned to the Unicode character
    'char' as a set of strings.

    >>> script_extension("a") == {'Latin'}
    True
    >>> script_extension(unichr(0x060C)) == {'Arab', 'Syrc', 'Thaa'}
    True
    >>> script_extension(unichr(0x10FFFF)) == {'Unknown'}
    True
    """
    codepoint = byteord(char)
    # the entry just left of the insertion point is the range whose start
    # is less than or equal to the codepoint
    index = bisect_right(ScriptExtensions.RANGES, codepoint) - 1
    extensions = ScriptExtensions.VALUES[index]
    if extensions is not None:
        return extensions
    # a None entry marks a code point that is not explicitly listed for
    # Script Extensions: its value is the corresponding Script property value
    return {script(char)}
def block(char):
    """ Return the block property assigned to the Unicode character 'char'
    as a string.

    >>> block("a")
    'Basic Latin'
    >>> block(unichr(0x060C))
    'Arabic'
    >>> block(unichr(0xEFFFF))
    'No_Block'
    """
    codepoint = byteord(char)
    # the entry just left of the insertion point is the range whose start
    # is less than or equal to the codepoint
    index = bisect_right(Blocks.RANGES, codepoint) - 1
    return Blocks.VALUES[index]