From 52d61315254223b5a19551985c84211a4868d841 Mon Sep 17 00:00:00 2001
From: Cosimo Lupo <cosimo@anthrotype.com>
Date: Fri, 17 Nov 2017 19:17:17 +0000
Subject: [PATCH] [unicodedata] add new module and 'script' function

The new `fontTools.unicodedata` module re-exports all the public
functions from the built-in `unicodedata` module, and also adds
additional functions.

The `script` function takes a unicode character and returns the
script name as defined in the UCD "Script.txt" data file.

It's implemented as a simple binary search, plus a memoizing
decorator that caches the results to avoid searching for the
same character more than once.

The unicodedata2 backport is imported if present, otherwise
the unicodedata built-in is used.
---
 Lib/fontTools/unicodedata/__init__.py | 71 +++++++++++++++++++++++++++
 1 file changed, 71 insertions(+)
 create mode 100644 Lib/fontTools/unicodedata/__init__.py

diff --git a/Lib/fontTools/unicodedata/__init__.py b/Lib/fontTools/unicodedata/__init__.py
new file mode 100644
index 000000000..e5d3189ee
--- /dev/null
+++ b/Lib/fontTools/unicodedata/__init__.py
@@ -0,0 +1,71 @@
+from __future__ import (
+    print_function, division, absolute_import, unicode_literals)
+from fontTools.misc.py23 import *
+
+import functools
+
+try:
+    # use unicodedata backport compatible with python2:
+    # https://github.com/mikekap/unicodedata2
+    from unicodedata2 import *
+except ImportError:
+    # fall back to built-in unicodedata (possibly outdated)
+    from unicodedata import *
+
+from .scripts import SCRIPT_RANGES
+
+
+__all__ = [
+    # names from built-in unicodedata module
+    "lookup",
+    "name",
+    "decimal",
+    "digit",
+    "numeric",
+    "category",
+    "bidirectional",
+    "combining",
+    "east_asian_width",
+    "mirrored",
+    "decomposition",
+    "normalize",
+    "unidata_version",
+    "ucd_3_2_0",
+    # additional functions defined in this module
+    "script",
+]
+
+
+def _memoize(func):
+    # Decorator for a one-argument function: each distinct argument is
+    # computed once and then served from a dict, which is also stored on
+    # the decorated function as 'func.cache'.
+    results = func.cache = {}
+
+    @functools.wraps(func)
+    def wrapper(arg):
+        if arg in results:
+            return results[arg]
+        value = results[arg] = func(arg)
+        return value
+    return wrapper
+
+
+@_memoize
+def script(char):
+    """Return the Unicode script name of 'char', or "Unknown" if unlisted."""
+    codepoint = byteord(char)
+    return _binary_search_range(codepoint, SCRIPT_RANGES, default="Unknown")
+
+
+def _binary_search_range(code, ranges, default=None):
+    # Binary search over (first, last, value) entries; assumes 'ranges' sorted.
+    lo, hi = 0, len(ranges) - 1
+    while lo <= hi:
+        mid = (lo + hi) // 2
+        if code < ranges[mid][0]:
+            hi = mid - 1
+        elif code > ranges[mid][1]:
+            lo = mid + 1
+        else:
+            return ranges[mid][2]
+    return default