From 697b8d9af534c34a1febd3c6e3807a2abcbfb078 Mon Sep 17 00:00:00 2001 From: Cosimo Lupo Date: Mon, 20 Nov 2017 18:16:02 +0100 Subject: [PATCH] [unicodedata] add block and script_extension functions --- Lib/fontTools/unicodedata/__init__.py | 57 +++++++++++++++++++++++++-- 1 file changed, 53 insertions(+), 4 deletions(-) diff --git a/Lib/fontTools/unicodedata/__init__.py b/Lib/fontTools/unicodedata/__init__.py index f0c22e4d9..4c6e346aa 100644 --- a/Lib/fontTools/unicodedata/__init__.py +++ b/Lib/fontTools/unicodedata/__init__.py @@ -8,11 +8,11 @@ try: # use unicodedata backport compatible with python2: # https://github.com/mikekap/unicodedata2 from unicodedata2 import * -except ImportError: +except ImportError: # pragma: no cover # fall back to built-in unicodedata (possibly outdated) from unicodedata import * -from .scripts import SCRIPT_RANGES, SCRIPT_NAMES +from . import Blocks, Scripts, ScriptExtensions __all__ = [ @@ -32,11 +32,23 @@ __all__ = [ "unidata_version", "ucd_3_2_0", # additonal functions + "block", "script", + "script_extension", ] def script(char): + """ Return the script property assigned to the Unicode character 'char' + as a string. + + >>> script("a") + 'Latin' + >>> script(",") + 'Common' + >>> script(unichr(0x10FFFF)) + 'Unknown' + """ code = byteord(char) # 'bisect_right(a, x, lo=0, hi=len(a))' returns an insertion point which # comes after (to the right of) any existing entries of x in a, and it @@ -47,5 +59,42 @@ def script(char): # breakpoints); we want to use `bisect_right` to look up the range that # contains the given codepoint: i.e. whose start is less than or equal # to the codepoint. Thus, we subtract -1 from the index returned. - i = bisect_right(SCRIPT_RANGES, code) - return SCRIPT_NAMES[i-1] + i = bisect_right(Scripts.RANGES, code) + return Scripts.VALUES[i-1] + + +def script_extension(char): + """ Return the script extension property assigned to the Unicode character + 'char' as a set of strings. 
+ + >>> script_extension("a") == {'Latin'} + True + >>> script_extension(unichr(0x060C)) == {'Arab', 'Syrc', 'Thaa'} + True + >>> script_extension(unichr(0x10FFFF)) == {'Unknown'} + True + """ + code = byteord(char) + i = bisect_right(ScriptExtensions.RANGES, code) + value = ScriptExtensions.VALUES[i-1] + if value is None: + # code points not explicitly listed for Script Extensions + # have as their value the corresponding Script property value + return {script(char)} + return value + + +def block(char): + """ Return the block property assigned to the Unicode character 'char' + as a string. + + >>> block("a") + 'Basic Latin' + >>> block(unichr(0x060C)) + 'Arabic' + >>> block(unichr(0xEFFFF)) + 'No_Block' + """ + code = byteord(char) + i = bisect_right(Blocks.RANGES, code) + return Blocks.VALUES[i-1]