[unicodedata] Fix bug in buildUCD.py

The method that was being used to read the header from the input was
inadvertently dropping the first non-header line on the floor. Although
this happens to be okay in some cases (where there is an empty line
after the header), in the case of newer versions of the
ScriptExtensions.txt file it was causing the generated code to be
missing the first entry, for U+00B7 (MIDDLE DOT).
This commit is contained in:
Colin Rofls 2025-01-28 19:03:43 -05:00
parent 6bc5d7f887
commit b26271cc4d
5 changed files with 73 additions and 29 deletions

View File

@ -16,7 +16,6 @@
# Format:
# Start Code..End Code; Block Name
RANGES = [
0x0000, # .. 0x007F ; Basic Latin
0x0080, # .. 0x00FF ; Latin-1 Supplement

View File

@ -32,9 +32,10 @@
#
# @missing: 0000..10FFFF; <script>
RANGES = [
0x0000, # .. 0x02BB ; None
0x0000, # .. 0x00B6 ; None
0x00B7, # .. 0x00B7 ; {'Avst', 'Cari', 'Copt', 'Dupl', 'Elba', 'Geor', 'Glag', 'Gong', 'Goth', 'Grek', 'Hani', 'Latn', 'Lydi', 'Mahj', 'Perm', 'Shaw'}
0x00B8, # .. 0x02BB ; None
0x02BC, # .. 0x02BC ; {'Beng', 'Cyrl', 'Deva', 'Latn', 'Lisu', 'Thai', 'Toto'}
0x02BD, # .. 0x02C6 ; None
0x02C7, # .. 0x02C7 ; {'Bopo', 'Latn'}
@ -316,7 +317,26 @@ RANGES = [
]
VALUES = [
None, # 0000..02BB
None, # 0000..00B6
{
"Avst",
"Cari",
"Copt",
"Dupl",
"Elba",
"Geor",
"Glag",
"Gong",
"Goth",
"Grek",
"Hani",
"Latn",
"Lydi",
"Mahj",
"Perm",
"Shaw",
}, # 00B7..00B7
None, # 00B8..02BB
{"Beng", "Cyrl", "Deva", "Latn", "Lisu", "Thai", "Toto"}, # 02BC..02BC
None, # 02BD..02C6
{"Bopo", "Latn"}, # 02C7..02C7

View File

@ -19,7 +19,6 @@
# https://www.unicode.org/reports/tr24/#Assignment_ScriptX_Values
#
RANGES = [
0x0000, # .. 0x0040 ; Common
0x0041, # .. 0x005A ; Latin

View File

@ -8,13 +8,12 @@ try:
from urllib.request import urlopen
except ImportError:
from urllib2 import urlopen
from contextlib import closing, contextmanager
import re
from codecs import iterdecode
import logging
import os
from io import open
from os.path import abspath, dirname, join as pjoin, pardir, sep
from typing import List
try: # pragma: no cover
@ -40,27 +39,26 @@ MAX_UNICODE = 0x10FFFF
log = logging.getLogger()
def open_unidata_file(filename) -> List[str]:
    """Download a text file from https://unicode.org/Public/UNIDATA/.

    Returns the decoded contents as a list of lines (without trailing
    newlines), so callers can scan the data more than once — unlike the
    previous generator-based version, which consumed lines irrevocably.
    """
    url = UNIDATA_URL + filename
    # Close the HTTP response deterministically rather than relying on
    # garbage collection to release the socket.
    with closing(urlopen(url)) as response:
        return response.read().decode("utf-8").splitlines()
def parse_unidata_header(file_lines: List[str]):
    """Collect the leading '#' comment lines of a Unicode data file.

    Scans ``file_lines`` from the top and gathers every consecutive line
    that starts with '#', stopping at the first line that does not. The
    input list is not modified, so callers can re-scan it afterwards
    (the first data line is NOT consumed).
    """
    header_lines = []
    for candidate in file_lines:
        # First non-comment line marks the end of the header.
        if not candidate.startswith("#"):
            break
        header_lines.append(candidate)
    return "\n".join(header_lines)
def parse_range_properties(infile, default=None, is_set=False):
def parse_range_properties(infile: List[str], default=None, is_set=False):
"""Parse a Unicode data file containing a column with one character or
a range of characters, and another column containing a property value
separated by a semicolon. Comments after '#' are ignored.
@ -180,14 +178,15 @@ def build_ranges(
if local_ucd:
log.info("loading '%s' from local directory '%s'", filename, local_ucd)
cm = open(pjoin(local_ucd, filename), "r", encoding="utf-8")
file_lines = [
l for l in open(pjoin(local_ucd, filename), "r", encoding="utf-8")
]
else:
log.info("downloading '%s' from '%s'", filename, UNIDATA_URL)
cm = open_unidata_file(filename)
file_lines = open_unidata_file(filename)
with cm as f:
header = parse_unidata_header(f)
ranges = parse_range_properties(f, default=default, is_set=is_set)
header = parse_unidata_header(file_lines)
ranges = parse_range_properties(file_lines, default=default, is_set=is_set)
if aliases:
reversed_aliases = {normalize(v[0]): k for k, v in aliases.items()}
@ -260,14 +259,14 @@ def parse_property_value_aliases(property_tag, local_ucd=None):
filename = "PropertyValueAliases.txt"
if local_ucd:
log.info("loading '%s' from local directory '%s'", filename, local_ucd)
cm = open(pjoin(local_ucd, filename), "r", encoding="utf-8")
file_lines = [
l for l in open(pjoin(local_ucd, filename), "r", encoding="utf-8")
]
else:
log.info("downloading '%s' from '%s'", filename, UNIDATA_URL)
cm = open_unidata_file(filename)
with cm as f:
header = parse_unidata_header(f)
data = parse_semicolon_separated_data(f)
file_lines = open_unidata_file(filename)
header = parse_unidata_header(file_lines)
data = parse_semicolon_separated_data(file_lines)
aliases = {item[1]: item[2:] for item in data if item[0] == property_tag}

View File

@ -155,6 +155,33 @@ def test_script():
def test_script_extension():
    # U+00B7 (MIDDLE DOT) is the first entry in newer versions of
    # ScriptExtensions.txt; it was previously dropped because the header
    # parser also consumed the first non-header line. This assertion
    # guards against that regression.
    # NOTE(review): the expected sets mirror ScriptExtensions.txt and
    # must be updated alongside the bundled UCD data.
    assert unicodedata.script_extension("\u00B7") == {
        "Avst",
        "Cari",
        "Copt",
        "Dupl",
        "Elba",
        "Geor",
        "Glag",
        "Gong",
        "Goth",
        "Grek",
        "Hani",
        "Latn",
        "Lydi",
        "Mahj",
        "Perm",
        "Shaw",
    }
    assert unicodedata.script_extension("\u02BC") == {
        "Beng",
        "Cyrl",
        "Deva",
        "Latn",
        "Lisu",
        "Thai",
        "Toto",
    }
    assert unicodedata.script_extension("a") == {"Latn"}
    # Characters with no specific extension fall back to their script:
    # NUL is Common ("Zyyy"); an unassigned code point is Unknown ("Zzzz").
    assert unicodedata.script_extension(chr(0)) == {"Zyyy"}
    assert unicodedata.script_extension(chr(0x0378)) == {"Zzzz"}