[unicodedata] add new module and 'script' function
The new `fontTools.unicodedata` module re-exports all the public functions from the built-in `unicodedata` module, and also adds additional functions. The `script` function takes a unicode character and returns the script name as defined in the UCD "Scripts.txt" data file. It's implemented as a simple binary search, plus a memoizing decorator that caches the results to avoid searching for the same character more than once. The unicodedata2 backport is imported if present; otherwise the built-in unicodedata module is used.
This commit is contained in:
parent
96dafe4afc
commit
52d6131525
71
Lib/fontTools/unicodedata/__init__.py
Normal file
71
Lib/fontTools/unicodedata/__init__.py
Normal file
@ -0,0 +1,71 @@
|
||||
from __future__ import (
|
||||
print_function, division, absolute_import, unicode_literals)
|
||||
from fontTools.misc.py23 import *
|
||||
|
||||
import functools
|
||||
|
||||
try:
|
||||
# use unicodedata backport compatible with python2:
|
||||
# https://github.com/mikekap/unicodedata2
|
||||
from unicodedata2 import *
|
||||
except ImportError:
|
||||
# fall back to built-in unicodedata (possibly outdated)
|
||||
from unicodedata import *
|
||||
|
||||
from .scripts import SCRIPT_RANGES
|
||||
|
||||
|
||||
# Public names exported by a star-import of this module.
#
# This module enables `unicode_literals`, so on Python 2 the literals below
# are `unicode` objects, and a star-import raises TypeError when `__all__`
# contains anything but native `str`. Coerce every entry with the builtin
# `str` (a no-op on Python 3). `map` is used rather than a list comprehension
# because a Python 2 comprehension would leak its loop variable into the
# module namespace, next to the re-exported unicodedata names.
__all__ = list(map(str, (
    # names from built-in unicodedata module
    "lookup",
    "name",
    "decimal",
    "digit",
    "numeric",
    "category",
    "bidirectional",
    "combining",
    "east_asian_width",
    "mirrored",
    "decomposition",
    "normalize",
    "unidata_version",
    "ucd_3_2_0",
    # additional functions
    "script",
)))
|
||||
|
||||
|
||||
def _memoize(func):
|
||||
# Decorator that caches a function's return value each time it is
|
||||
# called, and returns the cached value if called later with the same
|
||||
# argument.
|
||||
cache = func.cache = {}
|
||||
|
||||
@functools.wraps(func)
|
||||
def wrapper(arg):
|
||||
if arg not in cache:
|
||||
cache[arg] = func(arg)
|
||||
return cache[arg]
|
||||
return wrapper
|
||||
|
||||
|
||||
@_memoize
def script(char):
    """Return the script name of the unicode character 'char'.

    The character is converted to its code point with ``byteord`` and
    looked up in SCRIPT_RANGES; "Unknown" is returned when no range
    contains the code point. Results are cached per argument by the
    ``_memoize`` decorator.
    """
    return _binary_search_range(
        byteord(char), SCRIPT_RANGES, default="Unknown")
|
||||
|
||||
|
||||
def _binary_search_range(code, ranges, default=None):
|
||||
left = 0
|
||||
right = len(ranges) - 1
|
||||
while right >= left:
|
||||
mid = (left + right) >> 1
|
||||
if code < ranges[mid][0]:
|
||||
right = mid - 1
|
||||
elif code > ranges[mid][1]:
|
||||
left = mid + 1
|
||||
else:
|
||||
return ranges[mid][2]
|
||||
return default
|
Loading…
x
Reference in New Issue
Block a user