from __future__ import ( print_function, division, absolute_import, unicode_literals) from fontTools.misc.py23 import * import re from bisect import bisect_right try: # use unicodedata backport compatible with python2: # https://github.com/mikekap/unicodedata2 from unicodedata2 import * except ImportError: # pragma: no cover # fall back to built-in unicodedata (possibly outdated) from unicodedata import * from . import Blocks, Scripts, ScriptExtensions __all__ = [tostr(s) for s in ( # names from built-in unicodedata module "lookup", "name", "decimal", "digit", "numeric", "category", "bidirectional", "combining", "east_asian_width", "mirrored", "decomposition", "normalize", "unidata_version", "ucd_3_2_0", # additonal functions "block", "script", "script_extension", "script_name", "script_code", )] def script(char): """ Return the four-letter script code assigned to the Unicode character 'char' as string. >>> script("a") 'Latn' >>> script(",") 'Zyyy' >>> script(unichr(0x10FFFF)) 'Zzzz' """ code = byteord(char) # 'bisect_right(a, x, lo=0, hi=len(a))' returns an insertion point which # comes after (to the right of) any existing entries of x in a, and it # partitions array a into two halves so that, for the left side # all(val <= x for val in a[lo:i]), and for the right side # all(val > x for val in a[i:hi]). # Our 'SCRIPT_RANGES' is a sorted list of ranges (only their starting # breakpoints); we want to use `bisect_right` to look up the range that # contains the given codepoint: i.e. whose start is less than or equal # to the codepoint. Thus, we subtract -1 from the index returned. i = bisect_right(Scripts.RANGES, code) return Scripts.VALUES[i-1] def script_extension(char): """ Return the script extension property assigned to the Unicode character 'char' as a set of string. >>> script_extension("a") == {'Latn'} True >>> script_extension(unichr(0x060C)) == {'Arab', 'Syrc', 'Thaa'} True >>> script_extension(unichr(0x10FFFF)) == {'Zzzz'} True """ code = byteord(char) i = bisect_right(ScriptExtensions.RANGES, code) value = ScriptExtensions.VALUES[i-1] if value is None: # code points not explicitly listed for Script Extensions # have as their value the corresponding Script property value return {script(char)} return value def script_name(code, default=KeyError): """ Return the long, human-readable script name given a four-letter Unicode script code. If no matching name is found, a KeyError is raised by default. You can use the 'default' argument to return a fallback value (e.g. 'Unknown' or None) instead of throwing an error. """ try: return str(Scripts.NAMES[code].replace("_", " ")) except KeyError: if isinstance(default, type) and issubclass(default, KeyError): raise return default _normalize_re = re.compile(r"[-_ ]+") def _normalize_property_name(string): """Remove case, strip space, '-' and '_' for loose matching.""" return _normalize_re.sub("", string).lower() _SCRIPT_CODES = {_normalize_property_name(v): k for k, v in Scripts.NAMES.items()} def script_code(script_name, default=KeyError): """Returns the four-letter Unicode script code from its long name If no matching script code is found, a KeyError is raised by default. You can use the 'default' argument to return a fallback string (e.g. 'Zzzz' or None) instead of throwing an error. """ normalized_name = _normalize_property_name(script_name) try: return _SCRIPT_CODES[normalized_name] except KeyError: if isinstance(default, type) and issubclass(default, KeyError): raise return default def block(char): """ Return the block property assigned to the Unicode character 'char' as a string. >>> block("a") 'Basic Latin' >>> block(unichr(0x060C)) 'Arabic' >>> block(unichr(0xEFFFF)) 'No_Block' """ code = byteord(char) i = bisect_right(Blocks.RANGES, code) return Blocks.VALUES[i-1]