fonttools/Lib/fontTools/unicodedata/__init__.py

from __future__ import (
    print_function, division, absolute_import, unicode_literals)
from fontTools.misc.py23 import *

from bisect import bisect_right

try:
    # use unicodedata backport compatible with python2:
    # https://github.com/mikekap/unicodedata2
    from unicodedata2 import *
except ImportError:  # pragma: no cover
    # fall back to built-in unicodedata (possibly outdated)
    from unicodedata import *

from . import Blocks, Scripts, ScriptExtensions


__all__ = [
    # names from built-in unicodedata module
    "lookup",
    "name",
    "decimal",
    "digit",
    "numeric",
    "category",
    "bidirectional",
    "combining",
    "east_asian_width",
    "mirrored",
    "decomposition",
    "normalize",
    "unidata_version",
    "ucd_3_2_0",
    # additonal functions
    "block",
    "script",
    "script_extension",
]


def script(char):
    """ Look up the Unicode Script property of the character 'char' and
    return it as a four-letter script code string.

    >>> script("a")
    'Latn'
    >>> script(",")
    'Zyyy'
    >>> script(unichr(0x10FFFF))
    'Zzzz'
    """
    codepoint = byteord(char)
    # 'Scripts.RANGES' is treated as a sorted list containing only the
    # starting breakpoint of each range. 'bisect_right' returns the insertion
    # point located after every entry that is <= codepoint, so the entry
    # immediately before that point is the start of the range containing the
    # codepoint; 'Scripts.VALUES' is indexed in parallel with the ranges.
    insertion_point = bisect_right(Scripts.RANGES, codepoint)
    return Scripts.VALUES[insertion_point - 1]


def script_extension(char):
    """ Look up the Script Extensions property of the Unicode character
    'char' and return it as a set of four-letter script code strings.

    >>> script_extension("a") == {'Latn'}
    True
    >>> script_extension(unichr(0x060C)) == {'Arab', 'Syrc', 'Thaa'}
    True
    >>> script_extension(unichr(0x10FFFF)) == {'Zzzz'}
    True
    """
    codepoint = byteord(char)
    # entry just before the insertion point is the range that starts at or
    # below the codepoint
    index = bisect_right(ScriptExtensions.RANGES, codepoint) - 1
    extensions = ScriptExtensions.VALUES[index]
    if extensions is not None:
        return extensions
    # a None entry marks code points that are not explicitly listed for
    # Script Extensions: their value is the corresponding Script property
    return {script(char)}


def script_name(code):
    """ Convert the four-letter Unicode script code 'code' to its long,
    human-readable script name, with underscores replaced by spaces.

    Raises KeyError if 'code' has no matching name.
    """
    long_name = Scripts.NAMES[code]
    return long_name.replace("_", " ")


def block(char):
    """ Look up the Block property of the Unicode character 'char' and
    return it as a string.

    >>> block("a")
    'Basic Latin'
    >>> block(unichr(0x060C))
    'Arabic'
    >>> block(unichr(0xEFFFF))
    'No_Block'
    """
    codepoint = byteord(char)
    # entry just before the insertion point is the range that starts at or
    # below the codepoint
    index = bisect_right(Blocks.RANGES, codepoint) - 1
    return Blocks.VALUES[index]
[unicodedata] add new module and 'script' function The new `fontTools.unicodedata` module re-exports all the public functions from the built-in `unicodedata` module, and also adds additional functions. The `script` function takes a unicode character and returns the script name as defined in the UCD "Script.txt" data file. It's implemented as a simple binary search, plus a memoizing decorator that caches the results to avoid search the same character more than once. The unicodedata2 backport is imported if present, otherwise the unicodedata built-in is used. 2017-11-17 19:17:17 +00:00			`from __future__ import (`
			`print_function, division, absolute_import, unicode_literals)`
			`from fontTools.misc.py23 import *`

[unicodedata] use bisect.bisect_right function CPython comes with a fast C implementation of bisect module. This gives 4 to 5 times speed-ups over my pure-python version. 2017-11-20 13:30:17 +01:00			`from bisect import bisect_right`
[unicodedata] add new module and 'script' function The new `fontTools.unicodedata` module re-exports all the public functions from the built-in `unicodedata` module, and also adds additional functions. The `script` function takes a unicode character and returns the script name as defined in the UCD "Script.txt" data file. It's implemented as a simple binary search, plus a memoizing decorator that caches the results to avoid search the same character more than once. The unicodedata2 backport is imported if present, otherwise the unicodedata built-in is used. 2017-11-17 19:17:17 +00:00
			`try:`
			`# use unicodedata backport compatible with python2:`
			`# https://github.com/mikekap/unicodedata2`
			`from unicodedata2 import *`
[unicodedata] add block and script_extension functions 2017-11-20 18:16:02 +01:00			`except ImportError: # pragma: no cover`
[unicodedata] add new module and 'script' function The new `fontTools.unicodedata` module re-exports all the public functions from the built-in `unicodedata` module, and also adds additional functions. The `script` function takes a unicode character and returns the script name as defined in the UCD "Script.txt" data file. It's implemented as a simple binary search, plus a memoizing decorator that caches the results to avoid search the same character more than once. The unicodedata2 backport is imported if present, otherwise the unicodedata built-in is used. 2017-11-17 19:17:17 +00:00			`# fall back to built-in unicodedata (possibly outdated)`
			`from unicodedata import *`

[unicodedata] add block and script_extension functions 2017-11-20 18:16:02 +01:00			`from . import Blocks, Scripts, ScriptExtensions`
[unicodedata] add new module and 'script' function The new `fontTools.unicodedata` module re-exports all the public functions from the built-in `unicodedata` module, and also adds additional functions. The `script` function takes a unicode character and returns the script name as defined in the UCD "Script.txt" data file. It's implemented as a simple binary search, plus a memoizing decorator that caches the results to avoid search the same character more than once. The unicodedata2 backport is imported if present, otherwise the unicodedata built-in is used. 2017-11-17 19:17:17 +00:00

			`__all__ = [`
			`# names from built-in unicodedata module`
			`"lookup",`
			`"name",`
			`"decimal",`
			`"digit",`
			`"numeric",`
			`"category",`
			`"bidirectional",`
			`"combining",`
			`"east_asian_width",`
			`"mirrored",`
			`"decomposition",`
			`"normalize",`
			`"unidata_version",`
			`"ucd_3_2_0",`
			`# additonal functions`
[unicodedata] add block and script_extension functions 2017-11-20 18:16:02 +01:00			`"block",`
[unicodedata] add new module and 'script' function The new `fontTools.unicodedata` module re-exports all the public functions from the built-in `unicodedata` module, and also adds additional functions. The `script` function takes a unicode character and returns the script name as defined in the UCD "Script.txt" data file. It's implemented as a simple binary search, plus a memoizing decorator that caches the results to avoid search the same character more than once. The unicodedata2 backport is imported if present, otherwise the unicodedata built-in is used. 2017-11-17 19:17:17 +00:00			`"script",`
[unicodedata] add block and script_extension functions 2017-11-20 18:16:02 +01:00			`"script_extension",`
[unicodedata] add new module and 'script' function The new `fontTools.unicodedata` module re-exports all the public functions from the built-in `unicodedata` module, and also adds additional functions. The `script` function takes a unicode character and returns the script name as defined in the UCD "Script.txt" data file. It's implemented as a simple binary search, plus a memoizing decorator that caches the results to avoid search the same character more than once. The unicodedata2 backport is imported if present, otherwise the unicodedata built-in is used. 2017-11-17 19:17:17 +00:00			`]`


			`def script(char):`
[Tests] adjust unicodedata_test to expect short script codes 2017-11-22 16:50:55 +01:00			`""" Return the four-letter script code assigned to the Unicode character`
			`'char' as string.`
[unicodedata] add block and script_extension functions 2017-11-20 18:16:02 +01:00
			`>>> script("a")`
[Tests] adjust unicodedata_test to expect short script codes 2017-11-22 16:50:55 +01:00			`'Latn'`
[unicodedata] add block and script_extension functions 2017-11-20 18:16:02 +01:00			`>>> script(",")`
[Tests] adjust unicodedata_test to expect short script codes 2017-11-22 16:50:55 +01:00			`'Zyyy'`
[unicodedata] add block and script_extension functions 2017-11-20 18:16:02 +01:00			`>>> script(unichr(0x10FFFF))`
[Tests] adjust unicodedata_test to expect short script codes 2017-11-22 16:50:55 +01:00			`'Zzzz'`
[unicodedata] add block and script_extension functions 2017-11-20 18:16:02 +01:00			`"""`
[unicodedata] add new module and 'script' function The new `fontTools.unicodedata` module re-exports all the public functions from the built-in `unicodedata` module, and also adds additional functions. The `script` function takes a unicode character and returns the script name as defined in the UCD "Script.txt" data file. It's implemented as a simple binary search, plus a memoizing decorator that caches the results to avoid search the same character more than once. The unicodedata2 backport is imported if present, otherwise the unicodedata built-in is used. 2017-11-17 19:17:17 +00:00			`code = byteord(char)`
[unicodedata] use bisect.bisect_right function CPython comes with a fast C implementation of bisect module. This gives 4 to 5 times speed-ups over my pure-python version. 2017-11-20 13:30:17 +01:00			`# 'bisect_right(a, x, lo=0, hi=len(a))' returns an insertion point which`
			`# comes after (to the right of) any existing entries of x in a, and it`
			`# partitions array a into two halves so that, for the left side`
			`# all(val <= x for val in a[lo:i]), and for the right side`
			`# all(val > x for val in a[i:hi]).`
			`# Our 'SCRIPT_RANGES' is a sorted list of ranges (only their starting`
			# breakpoints); we want to use `bisect_right` to look up the range that
			`# contains the given codepoint: i.e. whose start is less than or equal`
			`# to the codepoint. Thus, we subtract -1 from the index returned.`
[unicodedata] add block and script_extension functions 2017-11-20 18:16:02 +01:00			`i = bisect_right(Scripts.RANGES, code)`
			`return Scripts.VALUES[i-1]`


			`def script_extension(char):`
			`""" Return the script extension property assigned to the Unicode character`
			`'char' as a set of string.`

[Tests] adjust unicodedata_test to expect short script codes 2017-11-22 16:50:55 +01:00			`>>> script_extension("a") == {'Latn'}`
[unicodedata] add block and script_extension functions 2017-11-20 18:16:02 +01:00			`True`
			`>>> script_extension(unichr(0x060C)) == {'Arab', 'Syrc', 'Thaa'}`
			`True`
[Tests] adjust unicodedata_test to expect short script codes 2017-11-22 16:50:55 +01:00			`>>> script_extension(unichr(0x10FFFF)) == {'Zzzz'}`
[unicodedata] add block and script_extension functions 2017-11-20 18:16:02 +01:00			`True`
			`"""`
			`code = byteord(char)`
			`i = bisect_right(ScriptExtensions.RANGES, code)`
			`value = ScriptExtensions.VALUES[i-1]`
			`if value is None:`
			`# code points not explicitly listed for Script Extensions`
			`# have as their value the corresponding Script property value`
			`return {script(char)}`
			`return value`


[unicodedata] add script_name function Converts four-letter script codes to human-readable long names 2017-11-22 16:57:14 +01:00			`def script_name(code):`
			`""" Return the long, human-readable script name given a four-letter`
			`Unicode script code.`

			`Raises KeyError if no matching name is found.`
			`"""`
			`return Scripts.NAMES[code].replace("_", " ")`


[unicodedata] add block and script_extension functions 2017-11-20 18:16:02 +01:00			`def block(char):`
			`""" Return the block property assigned to the Unicode character 'char'`
			`as a string.`

			`>>> block("a")`
			`'Basic Latin'`
			`>>> block(unichr(0x060C))`
			`'Arabic'`
			`>>> block(unichr(0xEFFFF))`
			`'No_Block'`
			`"""`
			`code = byteord(char)`
			`i = bisect_right(Blocks.RANGES, code)`
			`return Blocks.VALUES[i-1]`