From 697b8d9af534c34a1febd3c6e3807a2abcbfb078 Mon Sep 17 00:00:00 2001
From: Cosimo Lupo <cosimo@anthrotype.com>
Date: Mon, 20 Nov 2017 18:16:02 +0100
Subject: [PATCH] [unicodedata] add block and script_extension functions

---
 Lib/fontTools/unicodedata/__init__.py | 57 +++++++++++++++++++++++++--
 1 file changed, 53 insertions(+), 4 deletions(-)

diff --git a/Lib/fontTools/unicodedata/__init__.py b/Lib/fontTools/unicodedata/__init__.py
index f0c22e4d9..4c6e346aa 100644
--- a/Lib/fontTools/unicodedata/__init__.py
+++ b/Lib/fontTools/unicodedata/__init__.py
@@ -8,11 +8,11 @@ try:
     # use unicodedata backport compatible with python2:
     # https://github.com/mikekap/unicodedata2
     from unicodedata2 import *
-except ImportError:
+except ImportError:  # pragma: no cover
     # fall back to built-in unicodedata (possibly outdated)
     from unicodedata import *
 
-from .scripts import SCRIPT_RANGES, SCRIPT_NAMES
+from . import Blocks, Scripts, ScriptExtensions
 
 
 __all__ = [
@@ -32,11 +32,23 @@ __all__ = [
     "unidata_version",
     "ucd_3_2_0",
     # additonal functions
+    "block",
     "script",
+    "script_extension",
 ]
 
 
 def script(char):
+    """ Return the script property assigned to the Unicode character 'char'
+    as a string.
+
+    >>> script("a")
+    'Latin'
+    >>> script(",")
+    'Common'
+    >>> script(unichr(0x10FFFF))
+    'Unknown'
+    """
     code = byteord(char)
     # 'bisect_right(a, x, lo=0, hi=len(a))' returns an insertion point which
     # comes after (to the right of) any existing entries of x in a, and it
@@ -47,5 +59,42 @@ def script(char):
     # breakpoints); we want to use `bisect_right` to look up the range that
     # contains the given codepoint: i.e. whose start is less than or equal
     # to the codepoint. Thus, we subtract -1 from the index returned.
-    i = bisect_right(SCRIPT_RANGES, code)
-    return SCRIPT_NAMES[i-1]
+    i = bisect_right(Scripts.RANGES, code)
+    return Scripts.VALUES[i-1]
+
+
+def script_extension(char):
+    """ Return the script extension property assigned to the Unicode character
+    'char' as a set of strings.
+
+    >>> script_extension("a") == {'Latin'}
+    True
+    >>> script_extension(unichr(0x060C)) == {'Arab', 'Syrc', 'Thaa'}
+    True
+    >>> script_extension(unichr(0x10FFFF)) == {'Unknown'}
+    True
+    """
+    code = byteord(char)
+    i = bisect_right(ScriptExtensions.RANGES, code)
+    value = ScriptExtensions.VALUES[i-1]
+    if value is None:
+        # code points not explicitly listed for Script Extensions
+        # have as their value the corresponding Script property value
+        return {script(char)}
+    return value
+
+
+def block(char):
+    """ Return the block property assigned to the Unicode character 'char'
+    as a string.
+
+    >>> block("a")
+    'Basic Latin'
+    >>> block(unichr(0x060C))
+    'Arabic'
+    >>> block(unichr(0xEFFFF))
+    'No_Block'
+    """
+    code = byteord(char)
+    i = bisect_right(Blocks.RANGES, code)
+    return Blocks.VALUES[i-1]