From 52d61315254223b5a19551985c84211a4868d841 Mon Sep 17 00:00:00 2001
From: Cosimo Lupo
Date: Fri, 17 Nov 2017 19:17:17 +0000
Subject: [PATCH] [unicodedata] add new module and 'script' function

The new `fontTools.unicodedata` module re-exports all the public
functions from the built-in `unicodedata` module, and also adds
additional functions.

The `script` function takes a unicode character and returns the
script name as defined in the UCD "Script.txt" data file.
It's implemented as a simple binary search, plus a memoizing decorator
that caches the results to avoid searching for the same character more
than once.

The unicodedata2 backport is imported if present, otherwise the
unicodedata built-in is used.

The names listed in `__all__` are converted to native strings: with
`unicode_literals` they would otherwise be unicode objects on Python 2,
which makes `from fontTools.unicodedata import *` raise a TypeError.
---
 Lib/fontTools/unicodedata/__init__.py | 97 +++++++++++++++++++++++++++
 1 file changed, 97 insertions(+)
 create mode 100644 Lib/fontTools/unicodedata/__init__.py

diff --git a/Lib/fontTools/unicodedata/__init__.py b/Lib/fontTools/unicodedata/__init__.py
new file mode 100644
index 000000000..e5d3189ee
--- /dev/null
+++ b/Lib/fontTools/unicodedata/__init__.py
@@ -0,0 +1,97 @@
+"""Extended ``unicodedata`` module.
+
+Re-exports the public names of the ``unicodedata2`` backport when it is
+installed (the built-in ``unicodedata`` otherwise) and adds the ``script``
+function, which maps a character to its script name.
+"""
+from __future__ import (
+    print_function, division, absolute_import, unicode_literals)
+from fontTools.misc.py23 import *
+
+import functools
+
+try:
+    # use unicodedata backport compatible with python2:
+    # https://github.com/mikekap/unicodedata2
+    from unicodedata2 import *
+except ImportError:
+    # fall back to built-in unicodedata (possibly outdated)
+    from unicodedata import *
+
+from .scripts import SCRIPT_RANGES
+
+
+# With 'unicode_literals' the names below are unicode objects on Python 2,
+# where 'from fontTools.unicodedata import *' raises TypeError for items of
+# '__all__' that are not native strings; 'str' converts them. 'map' is used
+# instead of a list comprehension because the latter would leak its loop
+# variable into the module namespace on Python 2.
+__all__ = list(map(str, (
+    # names from built-in unicodedata module
+    "lookup",
+    "name",
+    "decimal",
+    "digit",
+    "numeric",
+    "category",
+    "bidirectional",
+    "combining",
+    "east_asian_width",
+    "mirrored",
+    "decomposition",
+    "normalize",
+    "unidata_version",
+    "ucd_3_2_0",
+    # additional functions
+    "script",
+)))
+
+
+def _memoize(func):
+    """Cache the results of the one-argument function 'func'.
+
+    The returned wrapper calls 'func' only the first time it receives a
+    given (hashable) argument, and returns the stored value afterwards.
+    The cache dictionary is stored as the 'cache' attribute of 'func';
+    'functools.wraps' makes it available on the wrapper as well.
+    """
+    cache = func.cache = {}
+
+    @functools.wraps(func)
+    def wrapper(arg):
+        if arg not in cache:
+            cache[arg] = func(arg)
+        return cache[arg]
+    return wrapper
+
+
+@_memoize
+def script(char):
+    """Return the script name of the unicode character 'char'.
+
+    The name is looked up in 'SCRIPT_RANGES'; "Unknown" is returned for
+    a code point that none of its ranges contains.
+    """
+    code = byteord(char)
+    return _binary_search_range(code, SCRIPT_RANGES, default="Unknown")
+
+
+def _binary_search_range(code, ranges, default=None):
+    """Return the value of the range in 'ranges' that contains 'code'.
+
+    'ranges' is a sequence of (first, last, value) items, where both ends
+    are inclusive; the binary search assumes that the items are sorted by
+    code point and do not overlap. Return 'default' if no range contains
+    'code'.
+    """
+    left = 0
+    right = len(ranges) - 1
+    while right >= left:
+        mid = (left + right) >> 1
+        if code < ranges[mid][0]:
+            right = mid - 1
+        elif code > ranges[mid][1]:
+            left = mid + 1
+        else:
+            return ranges[mid][2]
+    return default