Merge pull request #1111 from anthrotype/script-aliases
[unicodedata] return short codes; and functions to lookup aliases
This commit is contained in:
commit
b354d2ee30
File diff suppressed because it is too large
Load Diff
@ -2,6 +2,7 @@ from __future__ import (
|
|||||||
print_function, division, absolute_import, unicode_literals)
|
print_function, division, absolute_import, unicode_literals)
|
||||||
from fontTools.misc.py23 import *
|
from fontTools.misc.py23 import *
|
||||||
|
|
||||||
|
import re
|
||||||
from bisect import bisect_right
|
from bisect import bisect_right
|
||||||
|
|
||||||
try:
|
try:
|
||||||
@ -39,15 +40,15 @@ __all__ = [
|
|||||||
|
|
||||||
|
|
||||||
def script(char):
|
def script(char):
|
||||||
""" Return the script property assigned to the Unicode character 'char'
|
""" Return the four-letter script code assigned to the Unicode character
|
||||||
as string.
|
'char' as string.
|
||||||
|
|
||||||
>>> script("a")
|
>>> script("a")
|
||||||
'Latin'
|
'Latn'
|
||||||
>>> script(",")
|
>>> script(",")
|
||||||
'Common'
|
'Zyyy'
|
||||||
>>> script(unichr(0x10FFFF))
|
>>> script(unichr(0x10FFFF))
|
||||||
'Unknown'
|
'Zzzz'
|
||||||
"""
|
"""
|
||||||
code = byteord(char)
|
code = byteord(char)
|
||||||
# 'bisect_right(a, x, lo=0, hi=len(a))' returns an insertion point which
|
# 'bisect_right(a, x, lo=0, hi=len(a))' returns an insertion point which
|
||||||
@ -67,11 +68,11 @@ def script_extension(char):
|
|||||||
""" Return the script extension property assigned to the Unicode character
|
""" Return the script extension property assigned to the Unicode character
|
||||||
'char' as a set of string.
|
'char' as a set of string.
|
||||||
|
|
||||||
>>> script_extension("a") == {'Latin'}
|
>>> script_extension("a") == {'Latn'}
|
||||||
True
|
True
|
||||||
>>> script_extension(unichr(0x060C)) == {'Arab', 'Syrc', 'Thaa'}
|
>>> script_extension(unichr(0x060C)) == {'Arab', 'Syrc', 'Thaa'}
|
||||||
True
|
True
|
||||||
>>> script_extension(unichr(0x10FFFF)) == {'Unknown'}
|
>>> script_extension(unichr(0x10FFFF)) == {'Zzzz'}
|
||||||
True
|
True
|
||||||
"""
|
"""
|
||||||
code = byteord(char)
|
code = byteord(char)
|
||||||
@ -84,6 +85,52 @@ def script_extension(char):
|
|||||||
return value
|
return value
|
||||||
|
|
||||||
|
|
||||||
|
def script_name(code, default=KeyError):
    """Return the long, human-readable script name given a four-letter
    Unicode script code.

    Underscores in the stored long name are replaced by spaces.

    If no matching name is found, a KeyError is raised by default.

    You can use the 'default' argument to return a fallback value (e.g.
    'Unknown' or None) instead of throwing an error.
    """
    try:
        return str(Scripts.NAMES[code].replace("_", " "))
    except KeyError:
        # 'default' doubles as a sentinel: when it is the KeyError class
        # itself (or a subclass of it) the caller asked for no fallback, so
        # the original lookup error is propagated unchanged.
        if isinstance(default, type) and issubclass(default, KeyError):
            raise
        return default
|
||||||
|
|
||||||
|
|
||||||
|
# Matches runs of hyphens, underscores and spaces, which are ignored when
# comparing property names loosely.
_normalize_re = re.compile(r"[-_ ]+")


def _normalize_property_name(string):
    """Remove case, strip space, '-' and '_' for loose matching."""
    return _normalize_re.sub("", string).lower()
|
||||||
|
|
||||||
|
|
||||||
|
# Reverse lookup table built from Scripts.NAMES: normalized long script name
# -> script code (the keys of Scripts.NAMES).
_SCRIPT_CODES = {
    _normalize_property_name(name): code
    for code, name in Scripts.NAMES.items()
}
|
||||||
|
|
||||||
|
|
||||||
|
def script_code(script_name, default=KeyError):
    """Return the four-letter Unicode script code from its long name.

    The name is matched loosely: it is first passed through
    '_normalize_property_name', so case, spaces, hyphens and underscores
    are ignored.

    If no matching script code is found, a KeyError is raised by default.

    You can use the 'default' argument to return a fallback string (e.g.
    'Zzzz' or None) instead of throwing an error.
    """
    # NOTE: inside this function the 'script_name' parameter shadows the
    # module-level function of the same name; the parameter name is part of
    # the public signature and is therefore kept.
    normalized_name = _normalize_property_name(script_name)
    try:
        return _SCRIPT_CODES[normalized_name]
    except KeyError:
        # The KeyError class (or a subclass) as 'default' means "no fallback
        # requested": propagate the lookup error.
        if isinstance(default, type) and issubclass(default, KeyError):
            raise
        return default
|
||||||
|
|
||||||
|
|
||||||
def block(char):
|
def block(char):
|
||||||
""" Return the block property assigned to the Unicode character 'char'
|
""" Return the block property assigned to the Unicode character 'char'
|
||||||
as a string.
|
as a string.
|
||||||
|
@ -133,17 +133,40 @@ def parse_range_properties(infile, default=None, is_set=False):
|
|||||||
return merged_ranges
|
return merged_ranges
|
||||||
|
|
||||||
|
|
||||||
|
def parse_semicolon_separated_data(infile):
    """Parse a Unicode data file where each line contains a list of values
    separated by a semicolon (e.g. "PropertyValueAliases.txt").
    The number of values on different lines may be different.

    Everything from the first '#' to the end of a line is treated as a
    comment and discarded; lines that are empty after that are skipped.
    Surrounding whitespace is stripped from every value.

    'infile' is any iterable of text lines (e.g. an open text file).

    Returns a list of lists each containing the values as strings.
    """
    data = []
    for line in infile:
        line = line.split('#', 1)[0].strip()  # remove the comment
        if not line:
            continue
        fields = [str(field.strip()) for field in line.split(';')]
        data.append(fields)
    return data
|
||||||
|
|
||||||
|
|
||||||
def _set_repr(value):
|
def _set_repr(value):
|
||||||
return 'None' if value is None else "{{{}}}".format(
|
return 'None' if value is None else "{{{}}}".format(
|
||||||
", ".join(repr(v) for v in sorted(value)))
|
", ".join(repr(v) for v in sorted(value)))
|
||||||
|
|
||||||
|
|
||||||
def build_ranges(filename, local_ucd=None, output_path=None,
|
def build_ranges(filename, local_ucd=None, output_path=None,
|
||||||
default=None, is_set=False):
|
default=None, is_set=False, aliases=None):
|
||||||
"""Fetch 'filename' UCD data file from Unicode official website, parse
|
"""Fetch 'filename' UCD data file from Unicode official website, parse
|
||||||
the ranges and properties and write them as two Python lists
|
the property ranges and values and write them as two Python lists
|
||||||
to 'fontTools.unicodedata.<filename>.py'.
|
to 'fontTools.unicodedata.<filename>.py'.
|
||||||
|
|
||||||
|
'aliases' is an optional mapping of property codes (short names) to long
|
||||||
|
name aliases (list of strings, with the first item being the preferred
|
||||||
|
alias). When this is provided, the property values are written using the
|
||||||
|
short notation, and an additional 'NAMES' dict with the aliases is
|
||||||
|
written to the output module.
|
||||||
|
|
||||||
To load the data file from a local directory, you can use the
|
To load the data file from a local directory, you can use the
|
||||||
'local_ucd' argument.
|
'local_ucd' argument.
|
||||||
"""
|
"""
|
||||||
@ -162,7 +185,11 @@ def build_ranges(filename, local_ucd=None, output_path=None,
|
|||||||
header = parse_unidata_header(f)
|
header = parse_unidata_header(f)
|
||||||
ranges = parse_range_properties(f, default=default, is_set=is_set)
|
ranges = parse_range_properties(f, default=default, is_set=is_set)
|
||||||
|
|
||||||
max_value_length = min(56, max(len(repr(v)) for _, _, v in ranges))
|
if aliases:
|
||||||
|
reversed_aliases = {normalize(v[0]): k for k, v in aliases.items()}
|
||||||
|
max_value_length = 6 # 4-letter tags plus two quotes for repr
|
||||||
|
else:
|
||||||
|
max_value_length = min(56, max(len(repr(v)) for _, _, v in ranges))
|
||||||
|
|
||||||
with open(output_path, "w", encoding="utf-8") as f:
|
with open(output_path, "w", encoding="utf-8") as f:
|
||||||
f.write(SRC_ENCODING)
|
f.write(SRC_ENCODING)
|
||||||
@ -182,17 +209,63 @@ def build_ranges(filename, local_ucd=None, output_path=None,
|
|||||||
f.write("\n")
|
f.write("\n")
|
||||||
f.write("VALUES = [\n")
|
f.write("VALUES = [\n")
|
||||||
for first, last, value in ranges:
|
for first, last, value in ranges:
|
||||||
|
comment = "# {:0>4X}..{:0>4X}".format(first, last)
|
||||||
if is_set:
|
if is_set:
|
||||||
value_repr = "{},".format(_set_repr(value))
|
value_repr = "{},".format(_set_repr(value))
|
||||||
else:
|
else:
|
||||||
|
if aliases:
|
||||||
|
# append long name to comment and use the short code
|
||||||
|
comment += " ; {}".format(value)
|
||||||
|
value = reversed_aliases[normalize(value)]
|
||||||
value_repr = "{!r},".format(value)
|
value_repr = "{!r},".format(value)
|
||||||
f.write(" {} # {:0>4X}..{:0>4X}\n".format(
|
f.write(" {} {}\n".format(
|
||||||
value_repr.ljust(max_value_length+1), first, last))
|
value_repr.ljust(max_value_length+1), comment))
|
||||||
f.write("]\n")
|
f.write("]\n")
|
||||||
|
|
||||||
|
if aliases:
|
||||||
|
f.write("\n")
|
||||||
|
f.write("NAMES = {\n")
|
||||||
|
for value, names in sorted(aliases.items()):
|
||||||
|
# we only write the first preferred alias
|
||||||
|
f.write(" {!r}: {!r},\n".format(value, names[0]))
|
||||||
|
f.write("}\n")
|
||||||
|
|
||||||
log.info("saved new file: '%s'", os.path.normpath(output_path))
|
log.info("saved new file: '%s'", os.path.normpath(output_path))
|
||||||
|
|
||||||
|
|
||||||
|
# Matches runs of hyphens, underscores and spaces, which are ignored when
# comparing property value names loosely.
_normalize_re = re.compile(r"[-_ ]+")


def normalize(string):
    """Remove case, strip space, '-' and '_' for loose matching."""
    return _normalize_re.sub("", string).lower()
|
||||||
|
|
||||||
|
|
||||||
|
def parse_property_value_aliases(property_tag, local_ucd=None):
    """Fetch the current 'PropertyValueAliases.txt' from the Unicode website,
    parse the values for the specified 'property_tag' and return a dictionary
    of name aliases (list of strings) keyed by short value codes (strings).

    For each parsed line whose first field equals 'property_tag', the second
    field is used as the key and all remaining fields as the list of aliases.

    To load the data file from a local directory, you can use the
    'local_ucd' argument.
    """
    filename = "PropertyValueAliases.txt"
    if local_ucd:
        log.info("loading '%s' from local directory '%s'", filename, local_ucd)
        cm = open(pjoin(local_ucd, filename), "r", encoding="utf-8")
    else:
        log.info("downloading '%s' from '%s'", filename, UNIDATA_URL)
        cm = open_unidata_file(filename)

    with cm as f:
        # The returned header is not used here; the call itself is kept so
        # the file is read in the same order as before (presumably it
        # consumes the leading header lines -- confirm against
        # parse_unidata_header).
        parse_unidata_header(f)
        data = parse_semicolon_separated_data(f)

    aliases = {item[1]: item[2:] for item in data
               if item[0] == property_tag}

    return aliases
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
import argparse
|
import argparse
|
||||||
|
|
||||||
@ -207,7 +280,10 @@ def main():
|
|||||||
logging.basicConfig(level=level, format="%(message)s")
|
logging.basicConfig(level=level, format="%(message)s")
|
||||||
|
|
||||||
build_ranges("Blocks.txt", local_ucd=options.ucd_path, default="No_Block")
|
build_ranges("Blocks.txt", local_ucd=options.ucd_path, default="No_Block")
|
||||||
build_ranges("Scripts.txt", local_ucd=options.ucd_path, default="Unknown")
|
|
||||||
|
script_aliases = parse_property_value_aliases("sc", options.ucd_path)
|
||||||
|
build_ranges("Scripts.txt", local_ucd=options.ucd_path, default="Unknown",
|
||||||
|
aliases=script_aliases)
|
||||||
build_ranges("ScriptExtensions.txt", local_ucd=options.ucd_path,
|
build_ranges("ScriptExtensions.txt", local_ucd=options.ucd_path,
|
||||||
is_set=True)
|
is_set=True)
|
||||||
|
|
||||||
|
@ -4,162 +4,164 @@ from fontTools.misc.py23 import *
|
|||||||
|
|
||||||
from fontTools import unicodedata
|
from fontTools import unicodedata
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
|
||||||
def test_script():
|
def test_script():
|
||||||
assert unicodedata.script("a") == "Latin"
|
assert unicodedata.script("a") == "Latn"
|
||||||
assert unicodedata.script(unichr(0)) == "Common"
|
assert unicodedata.script(unichr(0)) == "Zyyy"
|
||||||
assert unicodedata.script(unichr(0x0378)) == "Unknown"
|
assert unicodedata.script(unichr(0x0378)) == "Zzzz"
|
||||||
assert unicodedata.script(unichr(0x10FFFF)) == "Unknown"
|
assert unicodedata.script(unichr(0x10FFFF)) == "Zzzz"
|
||||||
|
|
||||||
# these were randomly sampled, one character per script
|
# these were randomly sampled, one character per script
|
||||||
assert unicodedata.script(unichr(0x1E918)) == 'Adlam'
|
assert unicodedata.script(unichr(0x1E918)) == 'Adlm'
|
||||||
assert unicodedata.script(unichr(0x1170D)) == 'Ahom'
|
assert unicodedata.script(unichr(0x1170D)) == 'Ahom'
|
||||||
assert unicodedata.script(unichr(0x145A0)) == 'Anatolian_Hieroglyphs'
|
assert unicodedata.script(unichr(0x145A0)) == 'Hluw'
|
||||||
assert unicodedata.script(unichr(0x0607)) == 'Arabic'
|
assert unicodedata.script(unichr(0x0607)) == 'Arab'
|
||||||
assert unicodedata.script(unichr(0x056C)) == 'Armenian'
|
assert unicodedata.script(unichr(0x056C)) == 'Armn'
|
||||||
assert unicodedata.script(unichr(0x10B27)) == 'Avestan'
|
assert unicodedata.script(unichr(0x10B27)) == 'Avst'
|
||||||
assert unicodedata.script(unichr(0x1B41)) == 'Balinese'
|
assert unicodedata.script(unichr(0x1B41)) == 'Bali'
|
||||||
assert unicodedata.script(unichr(0x168AD)) == 'Bamum'
|
assert unicodedata.script(unichr(0x168AD)) == 'Bamu'
|
||||||
assert unicodedata.script(unichr(0x16ADD)) == 'Bassa_Vah'
|
assert unicodedata.script(unichr(0x16ADD)) == 'Bass'
|
||||||
assert unicodedata.script(unichr(0x1BE5)) == 'Batak'
|
assert unicodedata.script(unichr(0x1BE5)) == 'Batk'
|
||||||
assert unicodedata.script(unichr(0x09F3)) == 'Bengali'
|
assert unicodedata.script(unichr(0x09F3)) == 'Beng'
|
||||||
assert unicodedata.script(unichr(0x11C5B)) == 'Bhaiksuki'
|
assert unicodedata.script(unichr(0x11C5B)) == 'Bhks'
|
||||||
assert unicodedata.script(unichr(0x3126)) == 'Bopomofo'
|
assert unicodedata.script(unichr(0x3126)) == 'Bopo'
|
||||||
assert unicodedata.script(unichr(0x1103B)) == 'Brahmi'
|
assert unicodedata.script(unichr(0x1103B)) == 'Brah'
|
||||||
assert unicodedata.script(unichr(0x2849)) == 'Braille'
|
assert unicodedata.script(unichr(0x2849)) == 'Brai'
|
||||||
assert unicodedata.script(unichr(0x1A0A)) == 'Buginese'
|
assert unicodedata.script(unichr(0x1A0A)) == 'Bugi'
|
||||||
assert unicodedata.script(unichr(0x174E)) == 'Buhid'
|
assert unicodedata.script(unichr(0x174E)) == 'Buhd'
|
||||||
assert unicodedata.script(unichr(0x18EE)) == 'Canadian_Aboriginal'
|
assert unicodedata.script(unichr(0x18EE)) == 'Cans'
|
||||||
assert unicodedata.script(unichr(0x102B7)) == 'Carian'
|
assert unicodedata.script(unichr(0x102B7)) == 'Cari'
|
||||||
assert unicodedata.script(unichr(0x1053D)) == 'Caucasian_Albanian'
|
assert unicodedata.script(unichr(0x1053D)) == 'Aghb'
|
||||||
assert unicodedata.script(unichr(0x11123)) == 'Chakma'
|
assert unicodedata.script(unichr(0x11123)) == 'Cakm'
|
||||||
assert unicodedata.script(unichr(0xAA1F)) == 'Cham'
|
assert unicodedata.script(unichr(0xAA1F)) == 'Cham'
|
||||||
assert unicodedata.script(unichr(0xAB95)) == 'Cherokee'
|
assert unicodedata.script(unichr(0xAB95)) == 'Cher'
|
||||||
assert unicodedata.script(unichr(0x1F0C7)) == 'Common'
|
assert unicodedata.script(unichr(0x1F0C7)) == 'Zyyy'
|
||||||
assert unicodedata.script(unichr(0x2C85)) == 'Coptic'
|
assert unicodedata.script(unichr(0x2C85)) == 'Copt'
|
||||||
assert unicodedata.script(unichr(0x12014)) == 'Cuneiform'
|
assert unicodedata.script(unichr(0x12014)) == 'Xsux'
|
||||||
assert unicodedata.script(unichr(0x1082E)) == 'Cypriot'
|
assert unicodedata.script(unichr(0x1082E)) == 'Cprt'
|
||||||
assert unicodedata.script(unichr(0xA686)) == 'Cyrillic'
|
assert unicodedata.script(unichr(0xA686)) == 'Cyrl'
|
||||||
assert unicodedata.script(unichr(0x10417)) == 'Deseret'
|
assert unicodedata.script(unichr(0x10417)) == 'Dsrt'
|
||||||
assert unicodedata.script(unichr(0x093E)) == 'Devanagari'
|
assert unicodedata.script(unichr(0x093E)) == 'Deva'
|
||||||
assert unicodedata.script(unichr(0x1BC4B)) == 'Duployan'
|
assert unicodedata.script(unichr(0x1BC4B)) == 'Dupl'
|
||||||
assert unicodedata.script(unichr(0x1310C)) == 'Egyptian_Hieroglyphs'
|
assert unicodedata.script(unichr(0x1310C)) == 'Egyp'
|
||||||
assert unicodedata.script(unichr(0x1051C)) == 'Elbasan'
|
assert unicodedata.script(unichr(0x1051C)) == 'Elba'
|
||||||
assert unicodedata.script(unichr(0x2DA6)) == 'Ethiopic'
|
assert unicodedata.script(unichr(0x2DA6)) == 'Ethi'
|
||||||
assert unicodedata.script(unichr(0x10AD)) == 'Georgian'
|
assert unicodedata.script(unichr(0x10AD)) == 'Geor'
|
||||||
assert unicodedata.script(unichr(0x2C52)) == 'Glagolitic'
|
assert unicodedata.script(unichr(0x2C52)) == 'Glag'
|
||||||
assert unicodedata.script(unichr(0x10343)) == 'Gothic'
|
assert unicodedata.script(unichr(0x10343)) == 'Goth'
|
||||||
assert unicodedata.script(unichr(0x11371)) == 'Grantha'
|
assert unicodedata.script(unichr(0x11371)) == 'Gran'
|
||||||
assert unicodedata.script(unichr(0x03D0)) == 'Greek'
|
assert unicodedata.script(unichr(0x03D0)) == 'Grek'
|
||||||
assert unicodedata.script(unichr(0x0AAA)) == 'Gujarati'
|
assert unicodedata.script(unichr(0x0AAA)) == 'Gujr'
|
||||||
assert unicodedata.script(unichr(0x0A4C)) == 'Gurmukhi'
|
assert unicodedata.script(unichr(0x0A4C)) == 'Guru'
|
||||||
assert unicodedata.script(unichr(0x23C9F)) == 'Han'
|
assert unicodedata.script(unichr(0x23C9F)) == 'Hani'
|
||||||
assert unicodedata.script(unichr(0xC259)) == 'Hangul'
|
assert unicodedata.script(unichr(0xC259)) == 'Hang'
|
||||||
assert unicodedata.script(unichr(0x1722)) == 'Hanunoo'
|
assert unicodedata.script(unichr(0x1722)) == 'Hano'
|
||||||
assert unicodedata.script(unichr(0x108F5)) == 'Hatran'
|
assert unicodedata.script(unichr(0x108F5)) == 'Hatr'
|
||||||
assert unicodedata.script(unichr(0x05C2)) == 'Hebrew'
|
assert unicodedata.script(unichr(0x05C2)) == 'Hebr'
|
||||||
assert unicodedata.script(unichr(0x1B072)) == 'Hiragana'
|
assert unicodedata.script(unichr(0x1B072)) == 'Hira'
|
||||||
assert unicodedata.script(unichr(0x10847)) == 'Imperial_Aramaic'
|
assert unicodedata.script(unichr(0x10847)) == 'Armi'
|
||||||
assert unicodedata.script(unichr(0x033A)) == 'Inherited'
|
assert unicodedata.script(unichr(0x033A)) == 'Zinh'
|
||||||
assert unicodedata.script(unichr(0x10B66)) == 'Inscriptional_Pahlavi'
|
assert unicodedata.script(unichr(0x10B66)) == 'Phli'
|
||||||
assert unicodedata.script(unichr(0x10B4B)) == 'Inscriptional_Parthian'
|
assert unicodedata.script(unichr(0x10B4B)) == 'Prti'
|
||||||
assert unicodedata.script(unichr(0xA98A)) == 'Javanese'
|
assert unicodedata.script(unichr(0xA98A)) == 'Java'
|
||||||
assert unicodedata.script(unichr(0x110B2)) == 'Kaithi'
|
assert unicodedata.script(unichr(0x110B2)) == 'Kthi'
|
||||||
assert unicodedata.script(unichr(0x0CC6)) == 'Kannada'
|
assert unicodedata.script(unichr(0x0CC6)) == 'Knda'
|
||||||
assert unicodedata.script(unichr(0x3337)) == 'Katakana'
|
assert unicodedata.script(unichr(0x3337)) == 'Kana'
|
||||||
assert unicodedata.script(unichr(0xA915)) == 'Kayah_Li'
|
assert unicodedata.script(unichr(0xA915)) == 'Kali'
|
||||||
assert unicodedata.script(unichr(0x10A2E)) == 'Kharoshthi'
|
assert unicodedata.script(unichr(0x10A2E)) == 'Khar'
|
||||||
assert unicodedata.script(unichr(0x17AA)) == 'Khmer'
|
assert unicodedata.script(unichr(0x17AA)) == 'Khmr'
|
||||||
assert unicodedata.script(unichr(0x11225)) == 'Khojki'
|
assert unicodedata.script(unichr(0x11225)) == 'Khoj'
|
||||||
assert unicodedata.script(unichr(0x112B6)) == 'Khudawadi'
|
assert unicodedata.script(unichr(0x112B6)) == 'Sind'
|
||||||
assert unicodedata.script(unichr(0x0ED7)) == 'Lao'
|
assert unicodedata.script(unichr(0x0ED7)) == 'Laoo'
|
||||||
assert unicodedata.script(unichr(0xAB3C)) == 'Latin'
|
assert unicodedata.script(unichr(0xAB3C)) == 'Latn'
|
||||||
assert unicodedata.script(unichr(0x1C48)) == 'Lepcha'
|
assert unicodedata.script(unichr(0x1C48)) == 'Lepc'
|
||||||
assert unicodedata.script(unichr(0x1923)) == 'Limbu'
|
assert unicodedata.script(unichr(0x1923)) == 'Limb'
|
||||||
assert unicodedata.script(unichr(0x1071D)) == 'Linear_A'
|
assert unicodedata.script(unichr(0x1071D)) == 'Lina'
|
||||||
assert unicodedata.script(unichr(0x100EC)) == 'Linear_B'
|
assert unicodedata.script(unichr(0x100EC)) == 'Linb'
|
||||||
assert unicodedata.script(unichr(0xA4E9)) == 'Lisu'
|
assert unicodedata.script(unichr(0xA4E9)) == 'Lisu'
|
||||||
assert unicodedata.script(unichr(0x10284)) == 'Lycian'
|
assert unicodedata.script(unichr(0x10284)) == 'Lyci'
|
||||||
assert unicodedata.script(unichr(0x10926)) == 'Lydian'
|
assert unicodedata.script(unichr(0x10926)) == 'Lydi'
|
||||||
assert unicodedata.script(unichr(0x11161)) == 'Mahajani'
|
assert unicodedata.script(unichr(0x11161)) == 'Mahj'
|
||||||
assert unicodedata.script(unichr(0x0D56)) == 'Malayalam'
|
assert unicodedata.script(unichr(0x0D56)) == 'Mlym'
|
||||||
assert unicodedata.script(unichr(0x0856)) == 'Mandaic'
|
assert unicodedata.script(unichr(0x0856)) == 'Mand'
|
||||||
assert unicodedata.script(unichr(0x10AF0)) == 'Manichaean'
|
assert unicodedata.script(unichr(0x10AF0)) == 'Mani'
|
||||||
assert unicodedata.script(unichr(0x11CB0)) == 'Marchen'
|
assert unicodedata.script(unichr(0x11CB0)) == 'Marc'
|
||||||
assert unicodedata.script(unichr(0x11D28)) == 'Masaram_Gondi'
|
assert unicodedata.script(unichr(0x11D28)) == 'Gonm'
|
||||||
assert unicodedata.script(unichr(0xABDD)) == 'Meetei_Mayek'
|
assert unicodedata.script(unichr(0xABDD)) == 'Mtei'
|
||||||
assert unicodedata.script(unichr(0x1E897)) == 'Mende_Kikakui'
|
assert unicodedata.script(unichr(0x1E897)) == 'Mend'
|
||||||
assert unicodedata.script(unichr(0x109B0)) == 'Meroitic_Cursive'
|
assert unicodedata.script(unichr(0x109B0)) == 'Merc'
|
||||||
assert unicodedata.script(unichr(0x10993)) == 'Meroitic_Hieroglyphs'
|
assert unicodedata.script(unichr(0x10993)) == 'Mero'
|
||||||
assert unicodedata.script(unichr(0x16F5D)) == 'Miao'
|
assert unicodedata.script(unichr(0x16F5D)) == 'Plrd'
|
||||||
assert unicodedata.script(unichr(0x1160B)) == 'Modi'
|
assert unicodedata.script(unichr(0x1160B)) == 'Modi'
|
||||||
assert unicodedata.script(unichr(0x18A8)) == 'Mongolian'
|
assert unicodedata.script(unichr(0x18A8)) == 'Mong'
|
||||||
assert unicodedata.script(unichr(0x16A48)) == 'Mro'
|
assert unicodedata.script(unichr(0x16A48)) == 'Mroo'
|
||||||
assert unicodedata.script(unichr(0x1128C)) == 'Multani'
|
assert unicodedata.script(unichr(0x1128C)) == 'Mult'
|
||||||
assert unicodedata.script(unichr(0x105B)) == 'Myanmar'
|
assert unicodedata.script(unichr(0x105B)) == 'Mymr'
|
||||||
assert unicodedata.script(unichr(0x108AF)) == 'Nabataean'
|
assert unicodedata.script(unichr(0x108AF)) == 'Nbat'
|
||||||
assert unicodedata.script(unichr(0x19B3)) == 'New_Tai_Lue'
|
assert unicodedata.script(unichr(0x19B3)) == 'Talu'
|
||||||
assert unicodedata.script(unichr(0x1143D)) == 'Newa'
|
assert unicodedata.script(unichr(0x1143D)) == 'Newa'
|
||||||
assert unicodedata.script(unichr(0x07F4)) == 'Nko'
|
assert unicodedata.script(unichr(0x07F4)) == 'Nkoo'
|
||||||
assert unicodedata.script(unichr(0x1B192)) == 'Nushu'
|
assert unicodedata.script(unichr(0x1B192)) == 'Nshu'
|
||||||
assert unicodedata.script(unichr(0x169C)) == 'Ogham'
|
assert unicodedata.script(unichr(0x169C)) == 'Ogam'
|
||||||
assert unicodedata.script(unichr(0x1C56)) == 'Ol_Chiki'
|
assert unicodedata.script(unichr(0x1C56)) == 'Olck'
|
||||||
assert unicodedata.script(unichr(0x10CE9)) == 'Old_Hungarian'
|
assert unicodedata.script(unichr(0x10CE9)) == 'Hung'
|
||||||
assert unicodedata.script(unichr(0x10316)) == 'Old_Italic'
|
assert unicodedata.script(unichr(0x10316)) == 'Ital'
|
||||||
assert unicodedata.script(unichr(0x10A93)) == 'Old_North_Arabian'
|
assert unicodedata.script(unichr(0x10A93)) == 'Narb'
|
||||||
assert unicodedata.script(unichr(0x1035A)) == 'Old_Permic'
|
assert unicodedata.script(unichr(0x1035A)) == 'Perm'
|
||||||
assert unicodedata.script(unichr(0x103D5)) == 'Old_Persian'
|
assert unicodedata.script(unichr(0x103D5)) == 'Xpeo'
|
||||||
assert unicodedata.script(unichr(0x10A65)) == 'Old_South_Arabian'
|
assert unicodedata.script(unichr(0x10A65)) == 'Sarb'
|
||||||
assert unicodedata.script(unichr(0x10C09)) == 'Old_Turkic'
|
assert unicodedata.script(unichr(0x10C09)) == 'Orkh'
|
||||||
assert unicodedata.script(unichr(0x0B60)) == 'Oriya'
|
assert unicodedata.script(unichr(0x0B60)) == 'Orya'
|
||||||
assert unicodedata.script(unichr(0x104CF)) == 'Osage'
|
assert unicodedata.script(unichr(0x104CF)) == 'Osge'
|
||||||
assert unicodedata.script(unichr(0x104A8)) == 'Osmanya'
|
assert unicodedata.script(unichr(0x104A8)) == 'Osma'
|
||||||
assert unicodedata.script(unichr(0x16B12)) == 'Pahawh_Hmong'
|
assert unicodedata.script(unichr(0x16B12)) == 'Hmng'
|
||||||
assert unicodedata.script(unichr(0x10879)) == 'Palmyrene'
|
assert unicodedata.script(unichr(0x10879)) == 'Palm'
|
||||||
assert unicodedata.script(unichr(0x11AF1)) == 'Pau_Cin_Hau'
|
assert unicodedata.script(unichr(0x11AF1)) == 'Pauc'
|
||||||
assert unicodedata.script(unichr(0xA869)) == 'Phags_Pa'
|
assert unicodedata.script(unichr(0xA869)) == 'Phag'
|
||||||
assert unicodedata.script(unichr(0x10909)) == 'Phoenician'
|
assert unicodedata.script(unichr(0x10909)) == 'Phnx'
|
||||||
assert unicodedata.script(unichr(0x10B81)) == 'Psalter_Pahlavi'
|
assert unicodedata.script(unichr(0x10B81)) == 'Phlp'
|
||||||
assert unicodedata.script(unichr(0xA941)) == 'Rejang'
|
assert unicodedata.script(unichr(0xA941)) == 'Rjng'
|
||||||
assert unicodedata.script(unichr(0x16C3)) == 'Runic'
|
assert unicodedata.script(unichr(0x16C3)) == 'Runr'
|
||||||
assert unicodedata.script(unichr(0x0814)) == 'Samaritan'
|
assert unicodedata.script(unichr(0x0814)) == 'Samr'
|
||||||
assert unicodedata.script(unichr(0xA88C)) == 'Saurashtra'
|
assert unicodedata.script(unichr(0xA88C)) == 'Saur'
|
||||||
assert unicodedata.script(unichr(0x111C8)) == 'Sharada'
|
assert unicodedata.script(unichr(0x111C8)) == 'Shrd'
|
||||||
assert unicodedata.script(unichr(0x1045F)) == 'Shavian'
|
assert unicodedata.script(unichr(0x1045F)) == 'Shaw'
|
||||||
assert unicodedata.script(unichr(0x115AD)) == 'Siddham'
|
assert unicodedata.script(unichr(0x115AD)) == 'Sidd'
|
||||||
assert unicodedata.script(unichr(0x1D8C0)) == 'SignWriting'
|
assert unicodedata.script(unichr(0x1D8C0)) == 'Sgnw'
|
||||||
assert unicodedata.script(unichr(0x0DB9)) == 'Sinhala'
|
assert unicodedata.script(unichr(0x0DB9)) == 'Sinh'
|
||||||
assert unicodedata.script(unichr(0x110F9)) == 'Sora_Sompeng'
|
assert unicodedata.script(unichr(0x110F9)) == 'Sora'
|
||||||
assert unicodedata.script(unichr(0x11A60)) == 'Soyombo'
|
assert unicodedata.script(unichr(0x11A60)) == 'Soyo'
|
||||||
assert unicodedata.script(unichr(0x1B94)) == 'Sundanese'
|
assert unicodedata.script(unichr(0x1B94)) == 'Sund'
|
||||||
assert unicodedata.script(unichr(0xA81F)) == 'Syloti_Nagri'
|
assert unicodedata.script(unichr(0xA81F)) == 'Sylo'
|
||||||
assert unicodedata.script(unichr(0x0740)) == 'Syriac'
|
assert unicodedata.script(unichr(0x0740)) == 'Syrc'
|
||||||
assert unicodedata.script(unichr(0x1714)) == 'Tagalog'
|
assert unicodedata.script(unichr(0x1714)) == 'Tglg'
|
||||||
assert unicodedata.script(unichr(0x1761)) == 'Tagbanwa'
|
assert unicodedata.script(unichr(0x1761)) == 'Tagb'
|
||||||
assert unicodedata.script(unichr(0x1965)) == 'Tai_Le'
|
assert unicodedata.script(unichr(0x1965)) == 'Tale'
|
||||||
assert unicodedata.script(unichr(0x1A32)) == 'Tai_Tham'
|
assert unicodedata.script(unichr(0x1A32)) == 'Lana'
|
||||||
assert unicodedata.script(unichr(0xAA86)) == 'Tai_Viet'
|
assert unicodedata.script(unichr(0xAA86)) == 'Tavt'
|
||||||
assert unicodedata.script(unichr(0x116A5)) == 'Takri'
|
assert unicodedata.script(unichr(0x116A5)) == 'Takr'
|
||||||
assert unicodedata.script(unichr(0x0B8E)) == 'Tamil'
|
assert unicodedata.script(unichr(0x0B8E)) == 'Taml'
|
||||||
assert unicodedata.script(unichr(0x1754D)) == 'Tangut'
|
assert unicodedata.script(unichr(0x1754D)) == 'Tang'
|
||||||
assert unicodedata.script(unichr(0x0C40)) == 'Telugu'
|
assert unicodedata.script(unichr(0x0C40)) == 'Telu'
|
||||||
assert unicodedata.script(unichr(0x07A4)) == 'Thaana'
|
assert unicodedata.script(unichr(0x07A4)) == 'Thaa'
|
||||||
assert unicodedata.script(unichr(0x0E42)) == 'Thai'
|
assert unicodedata.script(unichr(0x0E42)) == 'Thai'
|
||||||
assert unicodedata.script(unichr(0x0F09)) == 'Tibetan'
|
assert unicodedata.script(unichr(0x0F09)) == 'Tibt'
|
||||||
assert unicodedata.script(unichr(0x2D3A)) == 'Tifinagh'
|
assert unicodedata.script(unichr(0x2D3A)) == 'Tfng'
|
||||||
assert unicodedata.script(unichr(0x114B0)) == 'Tirhuta'
|
assert unicodedata.script(unichr(0x114B0)) == 'Tirh'
|
||||||
assert unicodedata.script(unichr(0x1038B)) == 'Ugaritic'
|
assert unicodedata.script(unichr(0x1038B)) == 'Ugar'
|
||||||
assert unicodedata.script(unichr(0xA585)) == 'Vai'
|
assert unicodedata.script(unichr(0xA585)) == 'Vaii'
|
||||||
assert unicodedata.script(unichr(0x118CF)) == 'Warang_Citi'
|
assert unicodedata.script(unichr(0x118CF)) == 'Wara'
|
||||||
assert unicodedata.script(unichr(0xA066)) == 'Yi'
|
assert unicodedata.script(unichr(0xA066)) == 'Yiii'
|
||||||
assert unicodedata.script(unichr(0x11A31)) == 'Zanabazar_Square'
|
assert unicodedata.script(unichr(0x11A31)) == 'Zanb'
|
||||||
|
|
||||||
|
|
||||||
def test_script_extension():
|
def test_script_extension():
|
||||||
assert unicodedata.script_extension("a") == {"Latin"}
|
assert unicodedata.script_extension("a") == {"Latn"}
|
||||||
assert unicodedata.script_extension(unichr(0)) == {"Common"}
|
assert unicodedata.script_extension(unichr(0)) == {"Zyyy"}
|
||||||
assert unicodedata.script_extension(unichr(0x0378)) == {"Unknown"}
|
assert unicodedata.script_extension(unichr(0x0378)) == {"Zzzz"}
|
||||||
assert unicodedata.script_extension(unichr(0x10FFFF)) == {"Unknown"}
|
assert unicodedata.script_extension(unichr(0x10FFFF)) == {"Zzzz"}
|
||||||
|
|
||||||
assert unicodedata.script_extension("\u0660") == {'Arab', 'Thaa'}
|
assert unicodedata.script_extension("\u0660") == {'Arab', 'Thaa'}
|
||||||
assert unicodedata.script_extension("\u0964") == {
|
assert unicodedata.script_extension("\u0964") == {
|
||||||
@ -167,8 +169,40 @@ def test_script_extension():
|
|||||||
'Orya', 'Sind', 'Sinh', 'Sylo', 'Takr', 'Taml', 'Telu', 'Tirh'}
|
'Orya', 'Sind', 'Sinh', 'Sylo', 'Takr', 'Taml', 'Telu', 'Tirh'}
|
||||||
|
|
||||||
|
|
||||||
|
def test_script_name():
    """Check script_name() for known codes, missing codes and fallbacks."""
    assert unicodedata.script_name("Latn") == "Latin"
    assert unicodedata.script_name("Zyyy") == "Common"
    assert unicodedata.script_name("Zzzz") == "Unknown"
    # underscores in long names are replaced by spaces
    assert unicodedata.script_name("Egyp") == "Egyptian Hieroglyphs"

    with pytest.raises(KeyError):
        unicodedata.script_name("QQQQ")
    # compare against the fallback explicitly: a bare assert on the return
    # value would pass for any truthy result
    assert unicodedata.script_name("QQQQ", default="Unknown") == "Unknown"
|
||||||
|
|
||||||
|
|
||||||
|
def test_script_code():
    """Check script_code() for known names, loose matching and fallbacks."""
    assert unicodedata.script_code("Latin") == "Latn"
    assert unicodedata.script_code("Common") == "Zyyy"
    assert unicodedata.script_code("Unknown") == "Zzzz"
    # case, whitespace, underscores and hyphens are ignored
    assert unicodedata.script_code("Egyptian Hieroglyphs") == "Egyp"
    assert unicodedata.script_code("Egyptian_Hieroglyphs") == "Egyp"
    assert unicodedata.script_code("egyptianhieroglyphs") == "Egyp"
    assert unicodedata.script_code("Egyptian-Hieroglyphs") == "Egyp"

    with pytest.raises(KeyError):
        unicodedata.script_code("Does not exist")
    assert unicodedata.script_code("Does not exist", default="Zzzz") == "Zzzz"
|
||||||
|
|
||||||
|
|
||||||
def test_block():
|
def test_block():
|
||||||
assert unicodedata.block("\x00") == "Basic Latin"
|
assert unicodedata.block("\x00") == "Basic Latin"
|
||||||
assert unicodedata.block("\x7F") == "Basic Latin"
|
assert unicodedata.block("\x7F") == "Basic Latin"
|
||||||
assert unicodedata.block("\x80") == "Latin-1 Supplement"
|
assert unicodedata.block("\x80") == "Latin-1 Supplement"
|
||||||
assert unicodedata.block("\u1c90") == "No_Block"
|
assert unicodedata.block("\u1c90") == "No_Block"
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Allow running this test module directly: hand the command line to
    # pytest and use its result as the process exit status.
    import sys
    sys.exit(pytest.main(sys.argv))
|
||||||
|
Loading…
x
Reference in New Issue
Block a user