[unicodedata] add new module and 'script' function

The new `fontTools.unicodedata` module re-exports all the public functions from the built-in `unicodedata` module, and also adds additional functions. The `script` function takes a unicode character and returns the script name as defined in the UCD "Scripts.txt" data file. It's implemented as a simple binary search, plus a memoizing decorator that caches the results to avoid searching for the same character more than once. The unicodedata2 backport is imported if present, otherwise the unicodedata built-in is used.
2017-11-17 19:17:17 +00:00 · 2017-11-17 19:17:17 +00:00 · 52d6131525
commit 52d6131525
parent 96dafe4afc
1 changed files with 71 additions and 0 deletions
--- a/Lib/fontTools/unicodedata/__init__.py
+++ b/Lib/fontTools/unicodedata/__init__.py
@@ -0,0 +1,71 @@
+from __future__ import (
+    print_function, division, absolute_import, unicode_literals)
+from fontTools.misc.py23 import *
+
+import functools
+
+try:
+    # use unicodedata backport compatible with python2:
+    # https://github.com/mikekap/unicodedata2
+    from unicodedata2 import *
+except ImportError:
+    # fall back to built-in unicodedata (possibly outdated)
+    from unicodedata import *
+
+from .scripts import SCRIPT_RANGES
+
+
+# Names exported by a star-import of this module.
+__all__ = [
+    # names from built-in unicodedata module
+    "lookup",
+    "name",
+    "decimal",
+    "digit",
+    "numeric",
+    "category",
+    "bidirectional",
+    "combining",
+    "east_asian_width",
+    "mirrored",
+    "decomposition",
+    "normalize",
+    "unidata_version",
+    "ucd_3_2_0",
+    # additional functions defined in this module
+    "script",
+]
+
+
+def _memoize(func):
+    """Decorate the one-argument function 'func' with a result cache.
+
+    The first call with a given argument runs 'func' and stores its
+    return value in a dict keyed by that argument; later calls with an
+    equal argument return the stored value without calling 'func' again.
+    The dict is also attached to 'func' as its 'cache' attribute. The
+    argument is used as a dict key, so it must be hashable.
+    """
+    results = {}
+    func.cache = results
+
+    # 'cache' is set before wrapping because functools.wraps copies the
+    # attributes of 'func' onto the wrapper.
+    @functools.wraps(func)
+    def wrapper(arg):
+        if arg in results:
+            return results[arg]
+        value = func(arg)
+        results[arg] = value
+        return value
+    return wrapper
+
+
+@_memoize
+def script(char):
+    """Return the script name of the unicode character 'char'.
+
+    The value of byteord(char) is searched for in SCRIPT_RANGES, and
+    "Unknown" is returned when no range contains it. Results are cached
+    per character by the '_memoize' decorator.
+    """
+    return _binary_search_range(
+        byteord(char), SCRIPT_RANGES, default="Unknown")
+
+
+def _binary_search_range(code, ranges, default=None):
+    """Return the value of the range in 'ranges' that contains 'code'.
+
+    'ranges' is an indexable sequence of (first, last, value) entries
+    whose bounds are inclusive. It must be sorted in ascending order and
+    free of overlaps for the binary search to give correct results.
+    'default' is returned when no entry contains 'code', including when
+    'ranges' is empty.
+    """
+    lo, hi = 0, len(ranges) - 1
+    while lo <= hi:
+        mid = (lo + hi) // 2
+        entry = ranges[mid]
+        if entry[0] > code:
+            # 'code' lies before this range: continue in the lower half
+            hi = mid - 1
+        elif entry[1] < code:
+            # 'code' lies after this range: continue in the upper half
+            lo = mid + 1
+        else:
+            return entry[2]
+    return default