[unicodedata] use bisect.bisect_right function

CPython comes with a fast C implementation of the bisect module. This gives a 4 to 5 times speed-up over my pure-Python version.
2017-11-20 13:30:17 +01:00 · 2017-11-20 13:30:17 +01:00 · 3442da1529
commit 3442da1529
parent 5b3c189f6d
1 changed files with 13 additions and 33 deletions
--- a/Lib/fontTools/unicodedata/init.py
+++ b/Lib/fontTools/unicodedata/init.py
@ -2,7 +2,7 @@ from __future__ import (
    print_function, division, absolute_import, unicode_literals)
 from fontTools.misc.py23 import *
-import functools
+from bisect import bisect_right
 try:
    # use unicodedata backport compatible with python2:
@ -12,7 +12,7 @@ except ImportError:
    # fall back to built-in unicodedata (possibly outdated)
    from unicodedata import *
-from .scripts import SCRIPT_RANGES
+from .scripts import SCRIPT_RANGES, SCRIPT_NAMES
 __all__ = [
@ -36,36 +36,16 @@ __all__ = [
 ]
 def _memoize(func):
    # Decorator that caches a function's return value each time it is
    # called, and returns the cached value if called later with the same
    # argument.
    cache = func.cache = {}
    @functools.wraps(func)
    def wrapper(arg):
        if arg not in cache:
            cache[arg] = func(arg)
        return cache[arg]
    return wrapper
@_memoize
 def script(char):
    """For the unicode character 'char' return the script name."""
    code = byteord(char)
-    return _binary_search_range(code, SCRIPT_RANGES, default="Unknown")
+    # 'bisect_right(a, x, lo=0, hi=len(a))' returns an insertion point which
-
+    # comes after (to the right of) any existing entries of x in a, and it
-
+    # partitions array a into two halves so that, for the left side
-def _binary_search_range(code, ranges, default=None):
+    # all(val <= x for val in a[lo:i]), and for the right side
-    left = 0
+    # all(val > x for val in a[i:hi]).
-    right = len(ranges) - 1
+    # Our 'SCRIPT_RANGES' is a sorted list of ranges (only their starting
-    while right >= left:
+    # breakpoints); we want to use `bisect_right` to look up the range that
-        mid = (left + right) >> 1
+    # contains the given codepoint: i.e. whose start is less than or equal
-        if code < ranges[mid][0]:
+    # to the codepoint. Thus, we subtract 1 from the index returned.
-            right = mid - 1
+    i = bisect_right(SCRIPT_RANGES, code)
-        elif code > ranges[mid][1]:
+    return SCRIPT_NAMES[i-1]
            left = mid + 1
        else:
            return ranges[mid][2]
    return default