[unicodedata] use bisect.bisect_right function

CPython comes with a fast C implementation of the bisect module. This gives a 4- to 5-fold speed-up over my pure-Python version.
2017-11-20 13:30:17 +01:00 · 2017-11-20 13:30:17 +01:00 · 3442da1529
commit 3442da1529
parent 5b3c189f6d
1 changed file with 13 additions and 33 deletions
--- a/Lib/fontTools/unicodedata/__init__.py
+++ b/Lib/fontTools/unicodedata/__init__.py
@@ -2,7 +2,7 @@ from __future__ import (
    print_function, division, absolute_import, unicode_literals)
 from fontTools.misc.py23 import *

-import functools
+from bisect import bisect_right

 try:
    # use unicodedata backport compatible with python2:
@@ -12,7 +12,7 @@ except ImportError:
    # fall back to built-in unicodedata (possibly outdated)
    from unicodedata import *

-from .scripts import SCRIPT_RANGES
+from .scripts import SCRIPT_RANGES, SCRIPT_NAMES


 __all__ = [
@@ -36,36 +36,16 @@ __all__ = [
 ]


-def _memoize(func):
-    # Decorator that caches a function's return value each time it is
-    # called, and returns the cached value if called later with the same
-    # argument.
-    cache = func.cache = {}
-
-    @functools.wraps(func)
-    def wrapper(arg):
-        if arg not in cache:
-            cache[arg] = func(arg)
-        return cache[arg]
-    return wrapper
-
-
-@_memoize
 def script(char):
-    """For the unicode character 'char' return the script name."""
    code = byteord(char)
-    return _binary_search_range(code, SCRIPT_RANGES, default="Unknown")
-
-
-def _binary_search_range(code, ranges, default=None):
-    left = 0
-    right = len(ranges) - 1
-    while right >= left:
-        mid = (left + right) >> 1
-        if code < ranges[mid][0]:
-            right = mid - 1
-        elif code > ranges[mid][1]:
-            left = mid + 1
-        else:
-            return ranges[mid][2]
-    return default
+    # 'bisect_right(a, x, lo=0, hi=len(a))' returns an insertion point which
+    # comes after (to the right of) any existing entries of x in a, and it
+    # partitions array a into two halves so that, for the left side
+    # all(val <= x for val in a[lo:i]), and for the right side
+    # all(val > x for val in a[i:hi]).
+    # Our 'SCRIPT_RANGES' is a sorted list of ranges (only their starting
+    # breakpoints); we want to use `bisect_right` to look up the range that
+    # contains the given codepoint: i.e. whose start is less than or equal
+    # to the codepoint. Thus, we subtract 1 from the index returned.
+    i = bisect_right(SCRIPT_RANGES, code)
+    return SCRIPT_NAMES[i-1]