unicodedata: add ot_tag_to_script function

Returns the Unicode script code for a given OpenType script tag, or None if no match is found; raises ValueError if the tag is invalid.
This commit is contained in:
Cosimo Lupo 2018-01-23 11:45:20 -08:00
parent 29deb7e6fb
commit 677954d5b9
No known key found for this signature in database
GPG Key ID: 59D54DB0C9976482
3 changed files with 59 additions and 0 deletions

View File

@ -35,3 +35,7 @@ NEW_SCRIPT_TAGS = {
"Telu": ("tel2",), "Telu": ("tel2",),
"Mymr": ("mym2",), "Mymr": ("mym2",),
} }
# Reverse lookup built from NEW_SCRIPT_TAGS: every OpenType tag listed in a
# value tuple maps back to the key it is listed under.
NEW_SCRIPT_TAGS_REVERSED = {
    ot_tag: script
    for script, ot_tags in NEW_SCRIPT_TAGS.items()
    for ot_tag in ot_tags
}

View File

@ -40,6 +40,7 @@ __all__ = [tostr(s) for s in (
"script_code", "script_code",
"script_horizontal_direction", "script_horizontal_direction",
"ot_tags_from_script", "ot_tags_from_script",
"ot_tag_to_script",
)] )]
@ -239,3 +240,37 @@ def ot_tags_from_script(script_code):
script_tags.reverse() # last in, first out script_tags.reverse() # last in, first out
return script_tags return script_tags
def ot_tag_to_script(tag):
    """Map an OpenType script tag to a Unicode script code.

    The tag is converted with ``tostr`` and stripped of surrounding
    whitespace; tags shorter than four characters are right-padded with
    spaces before any lookup.

    Returns None for the default script tag ("DFLT"), or when the derived
    code is not found in ``Scripts.NAMES``.
    Raises ValueError if the stripped tag is empty, contains a space, or is
    longer than four characters.
    """
    tag = tostr(tag).strip()
    if len(tag) > 4 or " " in tag or not tag:
        raise ValueError("invalid OpenType tag: %r" % tag)

    # pad with trailing spaces up to four characters
    tag = tag.ljust(4)

    if tag == OTTags.DEFAULT_SCRIPT:
        # it's unclear which Unicode script the "DFLT" OpenType tag maps to,
        # so here we return None
        return None

    # Tags listed in the reversed new-tags table are resolved by direct
    # lookup; everything else falls through to the algorithmic conversion.
    try:
        return OTTags.NEW_SCRIPT_TAGS_REVERSED[tag]
    except KeyError:
        pass

    # Algorithmic conversion: upper-case the first character, keep the
    # second, and replace each trailing space by the character that precedes
    # it in the code built so far, e.g. 'nko ' -> 'Nkoo'.
    script_code = tag[0].upper() + tag[1]
    for index, char in enumerate(tag[2:], start=2):
        script_code += script_code[index - 1] if char == " " else char

    return script_code if script_code in Scripts.NAMES else None

View File

@ -218,6 +218,26 @@ def test_ot_tags_from_script():
assert unicodedata.ot_tags_from_script("Aaaa") == ["DFLT"] assert unicodedata.ot_tags_from_script("Aaaa") == ["DFLT"]
def test_ot_tag_to_script():
    """Exercise ot_tag_to_script with known, unknown and malformed tags."""
    # four-letter tags: only the first letter changes case
    assert unicodedata.ot_tag_to_script("latn") == "Latn"
    assert unicodedata.ot_tag_to_script("kana") == "Kana"

    # "DFLT" and a well-formed but unknown tag are expected to give None;
    # identity comparison so an object with a permissive __eq__ cannot pass
    assert unicodedata.ot_tag_to_script("DFLT") is None
    assert unicodedata.ot_tag_to_script("aaaa") is None

    # both the plain tag and the "2"-suffixed tags resolve to a script code
    assert unicodedata.ot_tag_to_script("beng") == "Beng"
    assert unicodedata.ot_tag_to_script("bng2") == "Beng"
    assert unicodedata.ot_tag_to_script("dev2") == "Deva"
    assert unicodedata.ot_tag_to_script("gjr2") == "Gujr"

    # space-padded tags: the last letter is repeated to fill four characters
    assert unicodedata.ot_tag_to_script("yi ") == "Yiii"
    assert unicodedata.ot_tag_to_script("nko ") == "Nkoo"
    assert unicodedata.ot_tag_to_script("vai ") == "Vaii"
    assert unicodedata.ot_tag_to_script("lao ") == "Laoo"
    # an unpadded short tag is accepted as well
    assert unicodedata.ot_tag_to_script("yi") == "Yiii"

    # empty, blank, space-containing and over-long tags must be rejected
    for invalid_value in ("", " ", "z zz", "zzzzz"):
        with pytest.raises(ValueError, match="invalid OpenType tag"):
            unicodedata.ot_tag_to_script(invalid_value)
def test_script_horizontal_direction(): def test_script_horizontal_direction():
assert unicodedata.script_horizontal_direction("Latn") == "LTR" assert unicodedata.script_horizontal_direction("Latn") == "LTR"
assert unicodedata.script_horizontal_direction("Arab") == "RTL" assert unicodedata.script_horizontal_direction("Arab") == "RTL"