[unicodedata] add block and script_extension functions
This commit is contained in:
parent
8b50ed56d9
commit
697b8d9af5
@ -8,11 +8,11 @@ try:
|
||||
# use unicodedata backport compatible with python2:
|
||||
# https://github.com/mikekap/unicodedata2
|
||||
from unicodedata2 import *
|
||||
except ImportError:
|
||||
except ImportError: # pragma: no cover
|
||||
# fall back to built-in unicodedata (possibly outdated)
|
||||
from unicodedata import *
|
||||
|
||||
from .scripts import SCRIPT_RANGES, SCRIPT_NAMES
|
||||
from . import Blocks, Scripts, ScriptExtensions
|
||||
|
||||
|
||||
__all__ = [
|
||||
@ -32,11 +32,23 @@ __all__ = [
|
||||
"unidata_version",
|
||||
"ucd_3_2_0",
|
||||
# additional functions
|
||||
"block",
|
||||
"script",
|
||||
"script_extension",
|
||||
]
|
||||
|
||||
|
||||
def script(char):
|
||||
""" Return the script property assigned to the Unicode character 'char'
|
||||
as string.
|
||||
|
||||
>>> script("a")
|
||||
'Latin'
|
||||
>>> script(",")
|
||||
'Common'
|
||||
>>> script(unichr(0x10FFFF))
|
||||
'Unknown'
|
||||
"""
|
||||
code = byteord(char)
|
||||
# 'bisect_right(a, x, lo=0, hi=len(a))' returns an insertion point which
|
||||
# comes after (to the right of) any existing entries of x in a, and it
|
||||
@ -47,5 +59,42 @@ def script(char):
|
||||
# breakpoints); we want to use `bisect_right` to look up the range that
|
||||
# contains the given codepoint: i.e. whose start is less than or equal
|
||||
# to the codepoint. Thus, we subtract 1 from the index returned.
|
||||
i = bisect_right(SCRIPT_RANGES, code)
|
||||
return SCRIPT_NAMES[i-1]
|
||||
i = bisect_right(Scripts.RANGES, code)
|
||||
return Scripts.VALUES[i-1]
|
||||
|
||||
|
||||
def script_extension(char):
    """Return the script extension property of the Unicode character 'char'
    as a set of strings.

    When the lookup table holds no explicit value for the character (the
    entry is None), the result is a one-element set containing the
    character's Script property value, as returned by script(char).
    Otherwise the set stored in the table is returned as-is (not copied).

    >>> script_extension("a") == {'Latin'}
    True
    >>> script_extension(unichr(0x060C)) == {'Arab', 'Syrc', 'Thaa'}
    True
    >>> script_extension(unichr(0x10FFFF)) == {'Unknown'}
    True
    """
    codepoint = byteord(char)
    # The entry just before the bisect_right insertion point is the one
    # selected for this codepoint (presumably RANGES holds sorted range
    # starts, as in script() -- confirm against the generated table).
    index = bisect_right(ScriptExtensions.RANGES, codepoint) - 1
    extensions = ScriptExtensions.VALUES[index]
    if extensions is not None:
        return extensions
    # code points not explicitly listed for Script Extensions
    # have as their value the corresponding Script property value
    return {script(char)}
|
||||
|
||||
|
||||
def block(char):
    """Return the block property of the Unicode character 'char' as a
    string.

    The value is read from the Blocks lookup table at the entry selected
    for the character's codepoint.

    >>> block("a")
    'Basic Latin'
    >>> block(unichr(0x060C))
    'Arabic'
    >>> block(unichr(0xEFFFF))
    'No_Block'
    """
    codepoint = byteord(char)
    # The entry just before the bisect_right insertion point is the one
    # selected for this codepoint (presumably RANGES holds sorted range
    # starts, as in script() -- confirm against the generated table).
    index = bisect_right(Blocks.RANGES, codepoint) - 1
    return Blocks.VALUES[index]
|
||||
|
Loading…
x
Reference in New Issue
Block a user