[unicodedata] add block and script_extension functions

2017-11-20 18:16:02 +01:00 · 2017-11-20 18:16:02 +01:00 · 697b8d9af5
commit 697b8d9af5
parent 8b50ed56d9
1 changed files with 53 additions and 4 deletions
--- a/Lib/fontTools/unicodedata/init.py
+++ b/Lib/fontTools/unicodedata/init.py
@ -8,11 +8,11 @@ try:
    # use unicodedata backport compatible with python2:
    # https://github.com/mikekap/unicodedata2
    from unicodedata2 import *
-except ImportError:
+except ImportError:  # pragma: no cover
    # fall back to built-in unicodedata (possibly outdated)
    from unicodedata import *

-from .scripts import SCRIPT_RANGES, SCRIPT_NAMES
+from . import Blocks, Scripts, ScriptExtensions


 __all__ = [
@ -32,11 +32,23 @@ __all__ = [
    "unidata_version",
    "ucd_3_2_0",
    # additional functions
+    "block",
    "script",
+    "script_extension",
 ]


 def script(char):
+    """ Return the script property assigned to the Unicode character 'char'
+    as a string.
+
+    >>> script("a")
+    'Latin'
+    >>> script(",")
+    'Common'
+    >>> script(unichr(0x10FFFF))
+    'Unknown'
+    """
    code = byteord(char)
    # 'bisect_right(a, x, lo=0, hi=len(a))' returns an insertion point which
    # comes after (to the right of) any existing entries of x in a, and it
@ -47,5 +59,42 @@ def script(char):
    # breakpoints); we want to use `bisect_right` to look up the range that
    # contains the given codepoint: i.e. whose start is less than or equal
    # to the codepoint. Thus, we subtract 1 from the index returned.
-    i = bisect_right(SCRIPT_RANGES, code)
-    return SCRIPT_NAMES[i-1]
+    i = bisect_right(Scripts.RANGES, code)
+    return Scripts.VALUES[i-1]
+
+
+def script_extension(char):
+    """ Return the script extension property assigned to the Unicode character
+    'char' as a set of strings.
+
+    >>> script_extension("a") == {'Latin'}
+    True
+    >>> script_extension(unichr(0x060C)) == {'Arab', 'Syrc', 'Thaa'}
+    True
+    >>> script_extension(unichr(0x10FFFF)) == {'Unknown'}
+    True
+    """
+    code = byteord(char)
+    i = bisect_right(ScriptExtensions.RANGES, code)
+    value = ScriptExtensions.VALUES[i-1]
+    if value is None:
+        # code points not explicitly listed for Script Extensions
+        # have as their value the corresponding Script property value
+        return {script(char)}
+    return value
+
+
+def block(char):
+    """ Return the block property assigned to the Unicode character 'char'
+    as a string.
+
+    >>> block("a")
+    'Basic Latin'
+    >>> block(unichr(0x060C))
+    'Arabic'
+    >>> block(unichr(0xEFFFF))
+    'No_Block'
+    """
+    code = byteord(char)
+    i = bisect_right(Blocks.RANGES, code)
+    return Blocks.VALUES[i-1]