[unicodedata] add script_code func and 'default' fallback arg

`script_code` does the reverse of `script_name`: it takes a long script name and returns a 4-letter script code. Both `script_name` and `script_code` raise KeyError by default, but can optionally return a default value instead.
2017-11-22 17:34:24 +01:00 · 2017-11-22 17:34:24 +01:00 · 99ea0a3986
commit 99ea0a3986
parent afd2490a6c
2 changed files with 58 additions and 3 deletions
--- a/Lib/fontTools/unicodedata/__init__.py
+++ b/Lib/fontTools/unicodedata/__init__.py
@@ -2,6 +2,7 @@ from __future__ import (
    print_function, division, absolute_import, unicode_literals)
 from fontTools.misc.py23 import *

+import re
 from bisect import bisect_right

 try:
@@ -84,13 +85,50 @@ def script_extension(char):
    return value


-def script_name(code):
+def script_name(code, default=KeyError):
    """ Return the long, human-readable script name given a four-letter
    Unicode script code.

-    Raises KeyError if no matching name is found.
+    Underscores in the stored name are replaced by spaces. An unknown code
+    raises KeyError by default; pass 'default' (e.g. 'Unknown' or None) to
+    have that value returned instead.
    """
-    return Scripts.NAMES[code].replace("_", " ")
+    try:
+        long_name = Scripts.NAMES[code]
+    except KeyError:
+        if not (isinstance(default, type) and issubclass(default, KeyError)):
+            return default
+        raise
+    return str(long_name.replace("_", " "))
+
+
+_normalize_re = re.compile(r"[-_\s]+")  # any whitespace, '-' or '_'
+
+
+def _normalize_property_name(string):
+    """Lowercase and remove whitespace, '-' and '_' for loose matching."""
+    return _normalize_re.sub("", string).lower()
+
+
+_SCRIPT_CODES = {_normalize_property_name(long_name): code
+                 for code, long_name in Scripts.NAMES.items()}
+
+
+def script_code(script_name, default=KeyError):
+    """Return the four-letter Unicode script code for a long script name.
+
+    The name is matched loosely: case, spaces, hyphens and underscores are
+    ignored. If no script matches, a KeyError is raised by default; pass
+    'default' (e.g. 'Zzzz' or None) to have that value returned instead.
+    """
+    lookup_key = _normalize_property_name(script_name)
+    try:
+        return _SCRIPT_CODES[lookup_key]
+    except KeyError:
+        # 'default' is returned as-is unless it is KeyError or a subclass
+        if not (isinstance(default, type) and issubclass(default, KeyError)):
+            return default
+        raise


 def block(char):
--- a/Tests/unicodedata_test.py
+++ b/Tests/unicodedata_test.py
@@ -175,8 +175,25 @@ def test_script_name():
    assert unicodedata.script_name("Zzzz") == "Unknown"
    # underscores in long names are replaced by spaces
    assert unicodedata.script_name("Egyp") == "Egyptian Hieroglyphs"
+
    with pytest.raises(KeyError):
        unicodedata.script_name("QQQQ")
+    assert unicodedata.script_name("QQQQ", default="Unknown") == "Unknown"
+
+
+def test_script_code():
+    for long_name, expected in [
+        ("Latin", "Latn"), ("Common", "Zyyy"), ("Unknown", "Zzzz"),
+    ]:
+        assert unicodedata.script_code(long_name) == expected
+    # case, whitespace, underscores and hyphens are ignored
+    for spelling in ("Egyptian Hieroglyphs", "Egyptian_Hieroglyphs",
+                     "egyptianhieroglyphs", "Egyptian-Hieroglyphs"):
+        assert unicodedata.script_code(spelling) == "Egyp"
+
+    with pytest.raises(KeyError):
+        unicodedata.script_code("Does not exist")
+    assert unicodedata.script_code("Does not exist", default="Zzzz") == "Zzzz"


 def test_block():