unicodedata: add script_horizontal_direction function

Same as HarfBuzz's hb_script_get_horizontal_direction.

We just hard-code the set of RTL scripts here, as it doesn't change often anyway.
The function is just syntactic sugar, as all it does is basically look up the
constant RTL_SCRIPTS set.
It's nice to have it here in a central place alongside 'script', 'script_name', etc.
This commit is contained in:
Cosimo Lupo 2018-01-19 18:04:33 +00:00
parent 38b25e00eb
commit 91a8cc33e7
No known key found for this signature in database
GPG Key ID: 59D54DB0C9976482
2 changed files with 81 additions and 0 deletions

View File

@ -38,6 +38,7 @@ __all__ = [tostr(s) for s in (
"script_extension",
"script_name",
"script_code",
"script_horizontal_direction",
"ot_tags_from_script",
)]
@ -134,6 +135,75 @@ def script_code(script_name, default=KeyError):
return default
# Script direction data, copied from harfbuzz's "hb-common.cc":
# https://goo.gl/X5FDXC
# The list agrees with the CLDR "scriptMetadata.txt" as of January 2018:
# http://unicode.org/repos/cldr/trunk/common/properties/scriptMetadata.txt
#
# Entries are four-letter script codes, grouped by the Unicode version
# under which each one is listed.
RTL_SCRIPTS = {
    # Added in Unicode 1.1
    'Arab',  # Arabic
    'Hebr',  # Hebrew

    # Added in Unicode 3.0
    'Syrc',  # Syriac
    'Thaa',  # Thaana

    # Added in Unicode 4.0
    'Cprt',  # Cypriot

    # Added in Unicode 4.1
    'Khar',  # Kharoshthi

    # Added in Unicode 5.0
    'Phnx',  # Phoenician
    'Nkoo',  # Nko

    # Added in Unicode 5.1
    'Lydi',  # Lydian

    # Added in Unicode 5.2
    'Avst',  # Avestan
    'Armi',  # Imperial Aramaic
    'Phli',  # Inscriptional Pahlavi
    'Prti',  # Inscriptional Parthian
    'Sarb',  # Old South Arabian
    'Orkh',  # Old Turkic
    'Samr',  # Samaritan

    # Added in Unicode 6.0
    'Mand',  # Mandaic

    # Added in Unicode 6.1
    'Merc',  # Meroitic Cursive
    'Mero',  # Meroitic Hieroglyphs

    # Added in Unicode 7.0
    'Mani',  # Manichaean
    'Mend',  # Mende Kikakui
    'Nbat',  # Nabataean
    'Narb',  # Old North Arabian
    'Palm',  # Palmyrene
    'Phlp',  # Psalter Pahlavi

    # Added in Unicode 8.0
    'Hatr',  # Hatran
    'Hung',  # Old Hungarian

    # Added in Unicode 9.0
    'Adlm',  # Adlam
}
def script_horizontal_direction(script_code, default=KeyError):
    """ Return the horizontal direction of a script as "RTL" or "LTR".

    'script_code' is looked up in Scripts.NAMES. A known code yields "RTL"
    when it is listed in RTL_SCRIPTS, and "LTR" otherwise.

    For a code that is not in Scripts.NAMES, 'default' decides the outcome:
    if it is the KeyError class (or a subclass of it), that exception is
    raised with 'script_code' as its argument; any other value is returned
    to the caller unchanged.
    """
    if script_code in Scripts.NAMES:
        if script_code in RTL_SCRIPTS:
            return str("RTL")
        return str("LTR")

    # Unknown script: either raise the requested exception class or hand
    # back the caller-supplied fallback value.
    raise_on_unknown = isinstance(default, type) and issubclass(default, KeyError)
    if raise_on_unknown:
        raise default(script_code)
    return default
def block(char):
""" Return the block property assigned to the Unicode character 'char'
as a string.

View File

@ -218,6 +218,17 @@ def test_ot_tags_from_script():
assert unicodedata.ot_tags_from_script("Aaaa") == ["DFLT"]
def test_script_horizontal_direction():
    direction = unicodedata.script_horizontal_direction

    # One left-to-right script and two right-to-left scripts.
    assert direction("Latn") == "LTR"
    assert direction("Arab") == "RTL"
    assert direction("Thaa") == "RTL"

    # An unknown code raises KeyError when no fallback is given...
    with pytest.raises(KeyError):
        direction("Azzz")

    # ...and returns the fallback when one is supplied.
    fallback = direction("Azzz", default="LTR")
    assert fallback == "LTR"
if __name__ == "__main__":
    # Allow running this test module directly: forward the command-line
    # arguments to pytest and exit with the status it reports.
    import sys

    exit_status = pytest.main(sys.argv)
    sys.exit(exit_status)