Merge pull request #1150 from anthrotype/ot-tags-from-script

unicodedata: add ot_tags_from_script function
This commit is contained in:
Cosimo Lupo 2018-01-19 13:21:03 +01:00 committed by GitHub
commit 2ed59f20f1
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 75 additions and 1 deletions

View File

@ -0,0 +1,37 @@
# Data updated to OpenType 1.8.2 as of January 2018.

# Complete list of OpenType script tags at:
# https://www.microsoft.com/typography/otspec/scripttags.htm

# Most of the script tags are the same as the ISO 15924 tag but lowercased,
# so we only have to handle the exceptional cases:
# - KATAKANA and HIRAGANA both map to 'kana';
# - spaces at the end are preserved, unlike ISO 15924;
# - we map special script codes for Inherited, Common and Unknown to DFLT.
#
# Every OpenType tag is exactly four characters long; tags with fewer
# significant characters are padded on the right with spaces.

# Tag used for the special script codes listed in SCRIPT_EXCEPTIONS below.
DEFAULT_SCRIPT = "DFLT"

# ISO 15924 script code -> OpenType script tag, for the codes whose tag is
# not simply the script code with its first letter lowercased.
SCRIPT_EXCEPTIONS = {
    "Hira": "kana",
    "Hrkt": "kana",
    "Laoo": "lao ",
    "Yiii": "yi  ",  # two trailing spaces: the tag must be 4 characters
    "Nkoo": "nko ",
    "Vaii": "vai ",
    "Zinh": DEFAULT_SCRIPT,
    "Zyyy": DEFAULT_SCRIPT,
    "Zzzz": DEFAULT_SCRIPT,
}

# ISO 15924 script code -> tuple of additional, newer OpenType script tags
# that exist alongside the script's original tag.
NEW_SCRIPT_TAGS = {
    "Beng": ("bng2",),
    "Deva": ("dev2",),
    "Gujr": ("gjr2",),
    "Guru": ("gur2",),
    "Knda": ("knd2",),
    "Mlym": ("mlm2",),
    "Orya": ("ory2",),
    "Taml": ("tml2",),
    "Telu": ("tel2",),
    "Mymr": ("mym2",),
}

View File

@ -13,7 +13,7 @@ except ImportError: # pragma: no cover
# fall back to built-in unicodedata (possibly outdated)
from unicodedata import *
from . import Blocks, Scripts, ScriptExtensions
from . import Blocks, Scripts, ScriptExtensions, OTTags
__all__ = [tostr(s) for s in (
@ -38,6 +38,7 @@ __all__ = [tostr(s) for s in (
"script_extension",
"script_name",
"script_code",
"ot_tags_from_script",
)]
@ -147,3 +148,24 @@ def block(char):
code = byteord(char)
i = bisect_right(Blocks.RANGES, code)
return Blocks.VALUES[i-1]
def ot_tags_from_script(script_code):
    """ Look up the OpenType script tags for a Unicode script code.

    The result is always a list. Codes that are not present in
    ``Scripts.NAMES`` yield a single-element list holding the default
    script tag. For a known code the list holds its base tag (taken from
    ``OTTags.SCRIPT_EXCEPTIONS`` when listed there, otherwise the code with
    its first letter lowercased); if the code also has entries in
    ``OTTags.NEW_SCRIPT_TAGS``, those are included and the whole list is
    reversed so that the base tag comes last.
    """
    known_codes = Scripts.NAMES
    if script_code not in known_codes:
        # invalid or unknown script code
        return [OTTags.DEFAULT_SCRIPT]

    # Regular case: same as the script code, but starting in lowercase.
    lowercased = script_code[0].lower() + script_code[1:]
    base_tag = OTTags.SCRIPT_EXCEPTIONS.get(script_code, lowercased)

    if script_code not in OTTags.NEW_SCRIPT_TAGS:
        return [base_tag]

    # Newer tags are appended after the base tag, then the order is flipped
    # (last in, first out).
    combined = [base_tag]
    combined += OTTags.NEW_SCRIPT_TAGS[script_code]
    return combined[::-1]

View File

@ -203,6 +203,21 @@ def test_block():
assert unicodedata.block("\u1c90") == "No_Block"
def test_ot_tags_from_script():
    """Check ot_tags_from_script against a table of (script code, tags)."""
    expectations = [
        # simple
        ("Latn", ["latn"]),
        # script mapped to multiple new and old script tags
        ("Deva", ["dev2", "deva"]),
        # exceptions
        ("Hira", ["kana"]),
        # special script codes map to DFLT
        ("Zinh", ["DFLT"]),
        ("Zyyy", ["DFLT"]),
        ("Zzzz", ["DFLT"]),
        # this is invalid or unknown
        ("Aaaa", ["DFLT"]),
    ]
    for script_code, expected_tags in expectations:
        assert unicodedata.ot_tags_from_script(script_code) == expected_tags
if __name__ == "__main__":
    # When executed directly, hand the command-line arguments to pytest and
    # exit with the status it returns.
    import sys

    exit_status = pytest.main(sys.argv)
    sys.exit(exit_status)