Merge pull request #3756 from cmyr/unicode-data-fixup

[unicodedata] Fix bug in buildUCD.py
Cosimo Lupo, 2025-01-29 17:35:33 +00:00, committed by GitHub
commit eece3c1b62
5 changed files with 73 additions and 29 deletions
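
The one-line message undersells the bug. In the old buildUCD.py (diffed below), parse_unidata_header() pulled lines off the same iterator that the data parser read next, so the line that terminated the header loop (the first line not starting with '#') was consumed and silently dropped. For ScriptExtensions.txt that lost line is evidently the record assigning U+00B7 MIDDLE DOT its Script_Extensions set, which is why the regenerated tables below change around 00B7. A minimal sketch of the failure mode, with hypothetical lines standing in for a real UCD file:

    # Hypothetical sample data; the real files come from unicode.org/Public/UNIDATA/.
    lines = iter(
        [
            "# header line 1",
            "# header line 2",
            "00B7 ; Avst Cari Copt",  # first data record
            "02BC ; Beng Cyrl Deva",
        ]
    )

    def parse_header_buggy(infile):
        """The pre-fix logic: header parsing shares the caller's iterator."""
        header = []
        line = next(infile)
        while line.startswith("#"):
            header.append(line)
            line = next(infile)  # stops the loop by reading the first record...
        return "".join(header)  # ...which is never handed back to the caller

    parse_header_buggy(lines)
    print(list(lines))  # ['02BC ; Beng Cyrl Deva'], the 00B7 record is gone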

Lib/fontTools/unicodedata/Blocks.py

@@ -16,7 +16,6 @@
 # Format:
 # Start Code..End Code; Block Name
-
 RANGES = [
     0x0000,  # .. 0x007F ; Basic Latin
     0x0080,  # .. 0x00FF ; Latin-1 Supplement

Lib/fontTools/unicodedata/ScriptExtensions.py

@@ -32,9 +32,10 @@
 #
 # @missing: 0000..10FFFF; <script>
-
 RANGES = [
-    0x0000,  # .. 0x02BB ; None
+    0x0000,  # .. 0x00B6 ; None
+    0x00B7,  # .. 0x00B7 ; {'Avst', 'Cari', 'Copt', 'Dupl', 'Elba', 'Geor', 'Glag', 'Gong', 'Goth', 'Grek', 'Hani', 'Latn', 'Lydi', 'Mahj', 'Perm', 'Shaw'}
+    0x00B8,  # .. 0x02BB ; None
     0x02BC,  # .. 0x02BC ; {'Beng', 'Cyrl', 'Deva', 'Latn', 'Lisu', 'Thai', 'Toto'}
     0x02BD,  # .. 0x02C6 ; None
     0x02C7,  # .. 0x02C7 ; {'Bopo', 'Latn'}
@@ -316,7 +317,26 @@ RANGES = [
 ]
 VALUES = [
-    None,  # 0000..02BB
+    None,  # 0000..00B6
+    {
+        "Avst",
+        "Cari",
+        "Copt",
+        "Dupl",
+        "Elba",
+        "Geor",
+        "Glag",
+        "Gong",
+        "Goth",
+        "Grek",
+        "Hani",
+        "Latn",
+        "Lydi",
+        "Mahj",
+        "Perm",
+        "Shaw",
+    },  # 00B7..00B7
+    None,  # 00B8..02BB
     {"Beng", "Cyrl", "Deva", "Latn", "Lisu", "Thai", "Toto"},  # 02BC..02BC
     None,  # 02BD..02C6
     {"Bopo", "Latn"},  # 02C7..02C7

Lib/fontTools/unicodedata/Scripts.py

@@ -19,7 +19,6 @@
 # https://www.unicode.org/reports/tr24/#Assignment_ScriptX_Values
 #
-
 RANGES = [
     0x0000,  # .. 0x0040 ; Common
     0x0041,  # .. 0x005A ; Latin

MetaTools/buildUCD.py

@@ -8,13 +8,12 @@ try:
     from urllib.request import urlopen
 except ImportError:
     from urllib2 import urlopen
-from contextlib import closing, contextmanager
 import re
-from codecs import iterdecode
 import logging
 import os
 from io import open
 from os.path import abspath, dirname, join as pjoin, pardir, sep
+from typing import List

 try:  # pragma: no cover
@@ -40,27 +39,26 @@ MAX_UNICODE = 0x10FFFF

 log = logging.getLogger()


-@contextmanager
-def open_unidata_file(filename):
+def open_unidata_file(filename) -> List[str]:
     """Open a text file from https://unicode.org/Public/UNIDATA/"""
     url = UNIDATA_URL + filename
-    with closing(urlopen(url)) as response:
-        yield iterdecode(response, encoding="utf-8")
+    return urlopen(url).read().decode("utf-8").splitlines()


-def parse_unidata_header(infile):
+def parse_unidata_header(file_lines: List[str]):
     """Read the top header of data files, until the first line
     that does not start with '#'.
     """
     header = []
-    line = next(infile)
-    while line.startswith("#"):
-        header.append(line)
-        line = next(infile)
-    return "".join(header)
+    for line in file_lines:
+        if line.startswith("#"):
+            header.append(line)
+        else:
+            break
+    return "\n".join(header)


-def parse_range_properties(infile, default=None, is_set=False):
+def parse_range_properties(infile: List[str], default=None, is_set=False):
     """Parse a Unicode data file containing a column with one character or
     a range of characters, and another column containing a property value
     separated by a semicolon. Comments after '#' are ignored.
@@ -180,14 +178,15 @@ def build_ranges(
     if local_ucd:
         log.info("loading '%s' from local directory '%s'", filename, local_ucd)
-        cm = open(pjoin(local_ucd, filename), "r", encoding="utf-8")
+        file_lines = [
+            l for l in open(pjoin(local_ucd, filename), "r", encoding="utf-8")
+        ]
     else:
         log.info("downloading '%s' from '%s'", filename, UNIDATA_URL)
-        cm = open_unidata_file(filename)
+        file_lines = open_unidata_file(filename)

-    with cm as f:
-        header = parse_unidata_header(f)
-        ranges = parse_range_properties(f, default=default, is_set=is_set)
+    header = parse_unidata_header(file_lines)
+    ranges = parse_range_properties(file_lines, default=default, is_set=is_set)

     if aliases:
         reversed_aliases = {normalize(v[0]): k for k, v in aliases.items()}
@@ -260,14 +259,14 @@ def parse_property_value_aliases(property_tag, local_ucd=None):
     filename = "PropertyValueAliases.txt"
     if local_ucd:
         log.info("loading '%s' from local directory '%s'", filename, local_ucd)
-        cm = open(pjoin(local_ucd, filename), "r", encoding="utf-8")
+        file_lines = [
+            l for l in open(pjoin(local_ucd, filename), "r", encoding="utf-8")
+        ]
     else:
         log.info("downloading '%s' from '%s'", filename, UNIDATA_URL)
-        cm = open_unidata_file(filename)
-    with cm as f:
-        header = parse_unidata_header(f)
-        data = parse_semicolon_separated_data(f)
+        file_lines = open_unidata_file(filename)
+    header = parse_unidata_header(file_lines)
+    data = parse_semicolon_separated_data(file_lines)

     aliases = {item[1]: item[2:] for item in data if item[0] == property_tag}
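
The shape of the fix: both loading paths now materialize the file into a list of lines up front, so parsing the header consumes nothing and the data parser, run over the same list afterwards, still sees every record. Re-running the earlier sketch against the patched logic (same hypothetical data):

    def parse_unidata_header(file_lines):
        """The patched logic from the diff above, with type hints omitted."""
        header = []
        for line in file_lines:
            if line.startswith("#"):
                header.append(line)
            else:
                break
        return "\n".join(header)

    file_lines = [
        "# header line",  # hypothetical data again, not a real UCD file
        "00B7 ; Avst Cari Copt",
        "02BC ; Beng Cyrl Deva",
    ]

    assert parse_unidata_header(file_lines) == "# header line"
    assert file_lines[1] == "00B7 ; Avst Cari Copt"  # nothing was consumed

Holding a whole UNIDATA file in memory is a fair trade; the files are at most a few megabytes, and it removes the closing/contextmanager plumbing entirely.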

Tests/unicodedata/unicodedata_test.py

@@ -155,6 +155,33 @@ def test_script():
 def test_script_extension():
+    assert unicodedata.script_extension("\u00B7") == {
+        "Avst",
+        "Cari",
+        "Copt",
+        "Dupl",
+        "Elba",
+        "Geor",
+        "Glag",
+        "Gong",
+        "Goth",
+        "Grek",
+        "Hani",
+        "Latn",
+        "Lydi",
+        "Mahj",
+        "Perm",
+        "Shaw",
+    }
+    assert unicodedata.script_extension("\u02BC") == {
+        "Beng",
+        "Cyrl",
+        "Deva",
+        "Latn",
+        "Lisu",
+        "Thai",
+        "Toto",
+    }
     assert unicodedata.script_extension("a") == {"Latn"}
     assert unicodedata.script_extension(chr(0)) == {"Zyyy"}
     assert unicodedata.script_extension(chr(0x0378)) == {"Zzzz"}