[unicodedata] add new module and 'script' function

The new `fontTools.unicodedata` module re-exports all the public
functions from the built-in `unicodedata` module, and also adds
additional functions.

The `script` function takes a unicode character and returns the
script name as defined in the UCD "Scripts.txt" data file.

It's implemented as a simple binary search, plus a memoizing
decorator that caches the results to avoid searching for the same
character more than once.

The unicodedata2 backport is imported if present, otherwise
the unicodedata built-in is used.
This commit is contained in:
Cosimo Lupo 2017-11-17 19:17:17 +00:00
parent 96dafe4afc
commit 52d6131525

View File

@ -0,0 +1,71 @@
from __future__ import (
print_function, division, absolute_import, unicode_literals)
from fontTools.misc.py23 import *
import functools
try:
# use unicodedata backport compatible with python2:
# https://github.com/mikekap/unicodedata2
from unicodedata2 import *
except ImportError:
# fall back to built-in unicodedata (possibly outdated)
from unicodedata import *
from .scripts import SCRIPT_RANGES
# Public API of this module: the names re-exported from `unicodedata`
# (or from the `unicodedata2` backport when it is installed), followed by
# the functions defined in this module.
__all__ = [
# names from built-in unicodedata module
"lookup",
"name",
"decimal",
"digit",
"numeric",
"category",
"bidirectional",
"combining",
"east_asian_width",
"mirrored",
"decomposition",
"normalize",
"unidata_version",
"ucd_3_2_0",
# additional functions
"script",
]
def _memoize(func):
    """Decorate a one-argument function so each result is computed once.

    The first call with a given argument stores the return value in a
    dict keyed by that argument; later calls with the same argument
    return the stored value without calling 'func' again. The argument
    must therefore be hashable.

    The dict is exposed as 'func.cache'. Since 'functools.wraps' copies
    the wrapped function's '__dict__', the same dict is also reachable
    as the 'cache' attribute of the returned wrapper.
    """
    results = {}
    func.cache = results

    @functools.wraps(func)
    def wrapper(arg):
        # Cache hit: hand back the stored value.
        if arg in results:
            return results[arg]
        # Cache miss: compute, remember, return.
        value = func(arg)
        results[arg] = value
        return value

    return wrapper
@_memoize
def script(char):
    """Return the script name of the unicode character 'char'.

    The character is converted to its integer code, which is then looked
    up in SCRIPT_RANGES. "Unknown" is returned when no range contains
    the code. Results are cached per character by the '_memoize'
    decorator.
    """
    codepoint = byteord(char)
    script_name = _binary_search_range(
        codepoint, SCRIPT_RANGES, default="Unknown"
    )
    return script_name
def _binary_search_range(code, ranges, default=None):
    """Find the value of the range in 'ranges' that contains 'code'.

    'ranges' is a sequence of (first, last, value) entries, with 'first'
    and 'last' both inclusive. The bisection only works if the entries
    are sorted in ascending order and do not overlap.

    Returns the 'value' of the entry whose range contains 'code', or
    'default' if no entry does (including when 'ranges' is empty).
    """
    lo, hi = 0, len(ranges) - 1
    while lo <= hi:
        mid = (lo + hi) // 2
        entry = ranges[mid]
        if code < entry[0]:
            # 'code' lies before this range: continue in the lower half.
            hi = mid - 1
            continue
        if code > entry[1]:
            # 'code' lies after this range: continue in the upper half.
            lo = mid + 1
            continue
        return entry[2]
    return default